API Proposal : Arm TableVectorLookup and TableVectorExtension intrinsics #1277

TamarChristinaArm · 2020-01-03T17:34:40Z

The instructions TBL and TBX do a table lookup using a vector of indices and a "table" which is created by concatenating all the input vectors together.

The difference between a TBL and a TBX is that a TBL returns 0 for any index that is out of range, whereas TBX is a destructive variant that leaves the destination element corresponding to an out-of-range index untouched.

I've modelled them using tuples for the register list as the instructions have an explicit requirement that the input vectors must use consecutive registers.

This fits in the general design of https://github.com/dotnet/corefx/issues/26574

namespace System.Runtime.Intrinsics.Arm
{
    /// <summary>
    /// Proposed table-lookup intrinsics (TBL / TBX and their A32 counterparts VTBL / VTBX).
    /// The "table" is the concatenation of every vector in <c>list</c>; <c>idx</c> selects one table byte per
    /// result element. VectorTableLookup yields 0 for an out-of-range index, while VectorTableExtension
    /// leaves the corresponding element of <c>result</c> untouched.
    /// The table is modelled as a ValueTuple because the instructions require consecutive registers.
    /// </summary>
    public abstract partial class AdvSimd : System.Runtime.Intrinsics.Arm.ArmBase
    {
        /// <summary>
        ///  uint8x8_t vqtbx1_u8(uint8x8_t r, uint8x16_t t, uint8x8_t idx)
        ///   A32: VTBX Dd,{Dn, Dn+1},Dm
        ///   A64: TBX Vd.8B,{Vn.16B},Vm.8B
        /// </summary>
        public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector128<byte>> list, Vector64<byte> idx) { throw null; }

        /// <summary>
        ///  uint8x8_t vqtbl1_u8(uint8x16_t t, uint8x8_t idx)
        ///   A32: VTBL Dd,{Dn, Dn+1},Dm
        ///   A64: TBL Vd.8B,{Vn.16B},Vm.8B
        /// </summary>
        public static Vector64<byte> VectorTableLookup(ValueTuple<Vector128<byte>> list, Vector64<byte> idx) { throw null; }

        /// <summary>
        ///  uint8x8_t vqtbx2_u8(uint8x8_t r, uint8x16x2_t t, uint8x8_t idx)
        ///   A32: VTBX Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
        ///   A64: TBX Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
        /// </summary>
        public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector128<byte>, Vector128<byte>> list, Vector64<byte> idx) { throw null; }

        /// <summary>
        ///  uint8x8_t vqtbl2_u8(uint8x16x2_t t, uint8x8_t idx)
        ///   A32: VTBL Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
        ///   A64: TBL Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
        /// </summary>
        public static Vector64<byte> VectorTableLookup(ValueTuple<Vector128<byte>, Vector128<byte>> list, Vector64<byte> idx) { throw null; }

        /// <summary>
        ///  int8x8_t vqtbx1_s8(int8x8_t r, int8x16_t t, uint8x8_t idx)
        ///   A32: VTBX Dd,{Dn, Dn+1},Dm
        ///   A64: TBX Vd.8B,{Vn.16B},Vm.8B
        /// </summary>
        public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

        /// <summary>
        ///  int8x8_t vqtbl1_s8(int8x16_t t, uint8x8_t idx)
        ///   A32: VTBL Dd,{Dn, Dn+1},Dm
        ///   A64: TBL Vd.8B,{Vn.16B},Vm.8B
        /// </summary>
        public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

        /// <summary>
        ///  int8x8_t vqtbx2_s8(int8x8_t r, int8x16x2_t t, uint8x8_t idx)
        ///   A32: VTBX Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
        ///   A64: TBX Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
        /// </summary>
        public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector128<sbyte>, Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

        /// <summary>
        ///  int8x8_t vqtbl2_s8(int8x16x2_t t, uint8x8_t idx)
        ///   A32: VTBL Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
        ///   A64: TBL Vd.8B,{Vn.16B - Vn+1.16B},Vm.8B
        /// </summary>
        public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>, Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

        /// <summary>
        /// A64-only forms: three- and four-register tables, and all 128-bit index/result variants.
        /// </summary>
        public new abstract partial class Arm64 : System.Runtime.Intrinsics.Arm.ArmBase.Arm64
        {
            /// <summary>
            ///  uint8x8_t vqtbx3_u8(uint8x8_t r, uint8x16x3_t t, uint8x8_t idx)
            ///   A64: TBX Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
            /// </summary>
            public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vqtbl3_u8(uint8x16x3_t t, uint8x8_t idx)
            ///   A64: TBL Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
            /// </summary>
            public static Vector64<byte> VectorTableLookup(ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vqtbx4_u8(uint8x8_t r, uint8x16x4_t t, uint8x8_t idx)
            ///   A64: TBX Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
            /// </summary>
            public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vqtbl4_u8(uint8x16x4_t t, uint8x8_t idx)
            ///   A64: TBL Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
            /// </summary>
            public static Vector64<byte> VectorTableLookup(ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vqtbx3_s8(int8x8_t r, int8x16x3_t t, uint8x8_t idx)
            ///   A64: TBX Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
            /// </summary>
            public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vqtbl3_s8(int8x16x3_t t, uint8x8_t idx)
            ///   A64: TBL Vd.8B,{Vn.16B - Vn+2.16B},Vm.8B
            /// </summary>
            public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vqtbx4_s8(int8x8_t r, int8x16x4_t t, uint8x8_t idx)
            ///   A64: TBX Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
            /// </summary>
            public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vqtbl4_s8(int8x16x4_t t, uint8x8_t idx)
            ///   A64: TBL Vd.8B,{Vn.16B - Vn+3.16B},Vm.8B
            /// </summary>
            public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbx1q_u8(uint8x16_t r, uint8x16_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableExtension(Vector128<byte> result, ValueTuple<Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbl1q_u8(uint8x16_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableLookup(ValueTuple<Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbx2q_u8(uint8x16_t r, uint8x16x2_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableExtension(Vector128<byte> result, ValueTuple<Vector128<byte>, Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbl2q_u8(uint8x16x2_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableLookup(ValueTuple<Vector128<byte>, Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbx3q_u8(uint8x16_t r, uint8x16x3_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableExtension(Vector128<byte> result, ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbl3q_u8(uint8x16x3_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableLookup(ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbx4q_u8(uint8x16_t r, uint8x16x4_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableExtension(Vector128<byte> result, ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x16_t vqtbl4q_u8(uint8x16x4_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
            /// </summary>
            public static Vector128<byte> VectorTableLookup(ValueTuple<Vector128<byte>, Vector128<byte>, Vector128<byte>, Vector128<byte>> list, Vector128<byte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbx1q_s8(int8x16_t r, int8x16_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableExtension(Vector128<sbyte> result, ValueTuple<Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbl1q_s8(int8x16_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbx2q_s8(int8x16_t r, int8x16x2_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableExtension(Vector128<sbyte> result, ValueTuple<Vector128<sbyte>, Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbl2q_s8(int8x16x2_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B - Vn+1.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>, Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbx3q_s8(int8x16_t r, int8x16x3_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableExtension(Vector128<sbyte> result, ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbl3q_s8(int8x16x3_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B - Vn+2.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbx4q_s8(int8x16_t r, int8x16x4_t t, uint8x16_t idx)
            ///   A64: TBX Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableExtension(Vector128<sbyte> result, ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x16_t vqtbl4q_s8(int8x16x4_t t, uint8x16_t idx)
            ///   A64: TBL Vd.16B,{Vn.16B - Vn+3.16B},Vm.16B
            /// </summary>
            public static Vector128<sbyte> VectorTableLookup(ValueTuple<Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>> list, Vector128<sbyte> idx) { throw null; }
        }

        /// <summary>
        /// A32-only forms: the table is built from one to four 64-bit (D) registers.
        /// </summary>
        public new abstract partial class Arm32 : System.Runtime.Intrinsics.Arm.ArmBase.Arm32
        {
            /// <summary>
            ///  uint8x8_t vtbx1_u8(uint8x8_t r, uint8x8_t a, uint8x8_t idx)
            ///   A32: VTBX Dd,{Dn},Dm
            /// </summary>
            public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t idx)
            ///   A32: VTBL Dd,{Dn},Dm
            /// </summary>
            public static Vector64<byte> VectorTableLookup(ValueTuple<Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vtbx2_u8(uint8x8_t r, uint8x8x2_t a, uint8x8_t idx)
            ///   A32: VTBX Dd,{Dn, Dn+1},Dm
            /// </summary>
            public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector64<byte>, Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t idx)
            ///   A32: VTBL Dd,{Dn, Dn+1},Dm
            /// </summary>
            public static Vector64<byte> VectorTableLookup(ValueTuple<Vector64<byte>, Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vtbx3_u8(uint8x8_t r, uint8x8x3_t a, uint8x8_t idx)
            ///   A32: VTBX Dd,{Dn, Dn+1, Dn+2},Dm
            /// </summary>
            public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector64<byte>, Vector64<byte>, Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t idx)
            ///   A32: VTBL Dd,{Dn, Dn+1, Dn+2},Dm
            /// </summary>
            public static Vector64<byte> VectorTableLookup(ValueTuple<Vector64<byte>, Vector64<byte>, Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vtbx4_u8(uint8x8_t r, uint8x8x4_t a, uint8x8_t idx)
            ///   A32: VTBX Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
            /// </summary>
            public static Vector64<byte> VectorTableExtension(Vector64<byte> result, ValueTuple<Vector64<byte>, Vector64<byte>, Vector64<byte>, Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t idx)
            ///   A32: VTBL Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
            /// </summary>
            public static Vector64<byte> VectorTableLookup(ValueTuple<Vector64<byte>, Vector64<byte>, Vector64<byte>, Vector64<byte>> list, Vector64<byte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbx1_s8(int8x8_t r, int8x8_t a, int8x8_t idx)
            ///   A32: VTBX Dd,{Dn},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbl1_s8(int8x8_t a, int8x8_t idx)
            ///   A32: VTBL Dd,{Dn},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbx2_s8(int8x8_t r, int8x8x2_t a, int8x8_t idx)
            ///   A32: VTBX Dd,{Dn, Dn+1},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector64<sbyte>, Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t idx)
            ///   A32: VTBL Dd,{Dn, Dn+1},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector64<sbyte>, Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbx3_s8(int8x8_t r, int8x8x3_t a, int8x8_t idx)
            ///   A32: VTBX Dd,{Dn, Dn+1, Dn+2},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector64<sbyte>, Vector64<sbyte>, Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t idx)
            ///   A32: VTBL Dd,{Dn, Dn+1, Dn+2},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector64<sbyte>, Vector64<sbyte>, Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbx4_s8(int8x8_t r, int8x8x4_t a, int8x8_t idx)
            ///   A32: VTBX Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableExtension(Vector64<sbyte> result, ValueTuple<Vector64<sbyte>, Vector64<sbyte>, Vector64<sbyte>, Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }

            /// <summary>
            ///  int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t idx)
            ///   A32: VTBL Dd,{Dn, Dn+1, Dn+2, Dn+3},Dm
            /// </summary>
            public static Vector64<sbyte> VectorTableLookup(ValueTuple<Vector64<sbyte>, Vector64<sbyte>, Vector64<sbyte>, Vector64<sbyte>> list, Vector64<sbyte> idx) { throw null; }
        }
    }
}

/cc @tannergooding @CarolEidt @echesakovMSFT

The text was updated successfully, but these errors were encountered:

Gnbrkm41 · 2020-01-04T03:54:38Z

ValueTuple with single element sounds odd, why not use the vector itself?

TamarChristinaArm · 2020-01-06T12:34:45Z

No particular reason other than uniformity of syntax and because the instruction itself expects a list there. But I concede that C# doesn't have very good support for single element tuples so I have no objections to changing it.

@CarolEidt I have been wondering about the register allocation for this though. If I understand correctly, the RA works just by looking at the type, right? So it doesn't know which function it's doing the allocations for?

In which case maybe we should have a new type rather than using tuples, so the constraint for sequential registers doesn't get applied to all tuples? Otherwise that would create unneeded register pressure in a lot of code and make some of it un-allocatable (e.g. a 30-element tuple).

terrajobst · 2020-03-17T19:01:20Z

Video

We should consider using a custom type rather than a tuple to represent lists, because we can expose more operations later. It also means people can't do things like taking the address of the tuple field. However, the nice thing about using tuples is language syntax. But we can probably get the same benefits if we were to define an implicit conversion between the two. We can also add a Deconstruct directly on the custom type, allowing for tuple-style deconstructing into locals.
We don't think we're ready for the overloads that take lists with more than one value for .NET 5, due to the necessary JIT implementation work.
We approved the Arm32 version but commented them out b/c we won't be shipping them in .NET 5

namespace System.Runtime.Intrinsics.Arm
{
    // Approved API shape (API-review shorthand: declarations only, no bodies).
    // Only the single-register table forms are included; the table is passed as a plain vector.
    public partial class AdvSimd
    {
        public static Vector64<byte>  VectorTableLookup(Vector128<byte>  table, Vector64<byte>  byteIndexes);
        public static Vector64<sbyte> VectorTableLookup(Vector128<sbyte> table, Vector64<sbyte> byteIndexes);

        // defaultValues supplies the elements kept for out-of-range indexes, so its element type matches the result.
        public static Vector64<byte>  VectorTableLookupExtension(Vector64<byte>  defaultValues, Vector128<byte>  table, Vector64<byte>  byteIndexes);
        public static Vector64<sbyte> VectorTableLookupExtension(Vector64<sbyte> defaultValues, Vector128<sbyte> table, Vector64<sbyte> byteIndexes);

        // Approved, but commented out because the Arm32 members will not ship in .NET 5.
        // public partial class Arm32
        // {
        //    public static Vector64<byte>  VectorTableLookup(Vector64<byte>  table, Vector64<byte>  byteIndexes);
        //    public static Vector64<sbyte> VectorTableLookup(Vector64<sbyte> table, Vector64<sbyte> byteIndexes);
        //
        //    public static Vector64<byte>  VectorTableLookupExtension(Vector64<byte>  defaultValues, Vector64<byte>  table, Vector64<byte>  byteIndexes);
        //    public static Vector64<sbyte> VectorTableLookupExtension(Vector64<sbyte> defaultValues, Vector64<sbyte> table, Vector64<sbyte> byteIndexes);
        // }

        public partial class Arm64
        {
            public static Vector128<byte>  VectorTableLookup(Vector128<byte>  table, Vector128<byte>  byteIndexes);
            public static Vector128<sbyte> VectorTableLookup(Vector128<sbyte> table, Vector128<sbyte> byteIndexes);

            public static Vector128<byte>  VectorTableLookupExtension(Vector128<byte>  defaultValues, Vector128<byte>  table, Vector128<byte>  byteIndexes);
            public static Vector128<sbyte> VectorTableLookupExtension(Vector128<sbyte> defaultValues, Vector128<sbyte> table, Vector128<sbyte> byteIndexes);
        }
    }
}

TamarChristinaArm · 2020-04-20T16:33:55Z

@tannergooding @echesakovMSFT Could use some advice on how to best handle these TBLs.

The problem is that we have the same API name with the same elements (they're all 8-bit element vectors) but that vary in essentially the function arity. This means I need to use the same instruction name INS_tbl but still have a way to distinguish between the different forms..

I'm currently thinking I can do this by having some new fmts.

DV_3H1reg
DV_3H2reg
DV_3H3reg
DV_3H4reg

Alternatively I can have just 1 new one DV_3H which also has a len field to fill in.

The problem I have here is that I am then not able to distinguish between the instructions in places where I don't have access to the value, e.g. I'm not sure how I need to handle getInsExecutionCharacteristics.

Alternatively I can re-use an existing fmt DV_3C and encode len directly into the different instructions and use emitInsCode to get the instruction and extract the value in order to distinguish between the different overloads.

or maybe something else entirely?

echesakov · 2020-04-20T17:32:20Z

@TamarChristinaArm You should use the approach I've taken in #33461, where I added suffixes _2regs, _3regs and _4regs for the multiple-structure variants of the ld1, st1 instructions. You would also need to update insGetLoadStoreRegisterListSize when you add the tbl, tbx instructions. Other than accepting a register list as an argument, the instruction encoding fits in the DV_3C category.

BTW, I added some helper functions for displaying consecutive register lists so you can use them when implementing emitInsDisp.

TamarChristinaArm · 2020-04-21T10:11:39Z

> @TamarChristinaArm You should use approach I've taken in #33461 where I added suffixes _2regs, _3regs and _4regs for multiple structures variants of ld1, st1 instructions.

@echesakovMSFT I saw that but it wasn't entirely clear to me why you chose that approach. I didn't see any usage of the _2regs, _3regs yet since we don't have RA support for it yet, but when we do, this means you can't use the table-driven approach anymore, right? You would then need to emit different instructions based on overloads of the intrinsics but for the same datatype. Though I am probably missing something.

Wouldn't it be better to have the same instruction name but overload on the fmt? That way you can still use the table setup for all overloads and it's only during emit that you have to set the right fmt while you validate the parameters. This seems to be done by other instructions already.

echesakov · 2020-04-21T17:04:15Z

@TamarChristinaArm For these intrinsics we can use SpecialCodeGen - the decision on what instruction to emit will be done in codegen not emitter in this case. I don't see this as a big issue - there is a handful of intrinsics that support multiple regs - ld[1-4],st[1-4],tbl and tbx.

We also don't know the final API surface even to say now if it's going to be needed. For example, if we had LoadVector128x2 LoadWithDeInterleavingVector128x2 then these would be mapped directly to ld1_2regs, ld2 and use table-driven codegen.

> Wouldn't it be better to have the same instruction name but overload on the fmt? That way you can still use the table setup for all overloads and it's only during emit that you have to set the right fmt while you validate the parameters. This seems to be done by other instructions already.

It's an option - but wouldn't this require defining multiple formats LS_2D[2-4]regs, LS_3F[2-4]regs, LS_2E[2-4]regs - since ld1 and st1 can be one of those? You would also need to pass information in emitIns_R_R_I, emitIns_R_R_R that an instruction uses multiple registers.

TamarChristinaArm · 2020-04-21T17:22:11Z

> It's an option - but wouldn't this require defining multiple formats LS_2D[2-4]regs, LS_3F[2-4]regs, LS_2E[2-4]regs - since ld1 and st1 can be one of those?

Yes but there aren't a lot of instructions with this characteristics though.

> You would also need to pass information in emitIns_R_R_I, emitIns_R_R_R that an instruction uses multiple registers.

Yes, though I see your point now. I had thought I could extract this information from the code_t itself, but I hadn't realized that emitInsCode expects you to know the fmt already to do the lookup. So it's a chicken and egg problem unless you pass this information to the function from some place that knows it..

So it's a bit unfortunate, but it does look like I'll need to just have 4 different instructions :(

echesakov · 2020-04-21T20:40:05Z

> So it's a bit unfortunately but it does look like I'll need to just have 4 different instructions :(

I agree that there could be a more elegant solution to this. I explored other options while implementing ld[1-4], st[1-4] - such as encoding the number of registers in a register list via emitAttr or emitInsOpts. However, the related changes would be much more disruptive/riskier and also far from ideal, so I decided to pick the least bad choice, i.e. encoding the information via the instruction opcode.

TamarChristinaArm · 2020-04-27T12:11:54Z

@echesakovMSFT I've been trying to build my changes all morning but they keep failing with /bin/sh: 2: /tmp/tmp84f5a862456d4c7998e56b7a01bbbb97.exec.cmd: /home/tamar/git/runtime/artifacts/bin/coreclr/Linux.arm64.Debug/sharedFramework/crossgen: not found

when I just do ./build.sh. I don't understand why it's trying to do a crossgen. I'm building on native aarch64 hardware.

Do you know what's going on?

TamarChristinaArm · 2020-04-27T15:04:08Z

ah, nvm, I see that the real error was hidden much much higher up..

john-h-k · 2020-05-05T16:09:59Z

Should this be closed now? Does that PR implement them?

TamarChristinaArm · 2020-05-05T16:21:31Z

No, it only implements the API-approved parts, i.e. the single-register versions. The multi-register versions aren't implemented as they require changes to the register allocator.

echesakov · 2020-06-12T19:39:19Z

Moving to the future since the multiple register variants of tbl and tbx are out of scope for 5.0

We write to stderr from CoreLib for two reasons: 1. FailFast 2. Unhandled exception (also FailFast in a way) We're using a roundabout way to use `Console.Error.WriteLine` but that has problems for FailFast because we then don't fail fast enough (dotnet#1277). This inlines enough of writing to stderr that we can get rid of System.Console dependency. Fixes dotnet#1277.

tannergooding · 2023-02-24T18:07:57Z

Closing this as the single-register variants have been implemented and the multiple-register variants are tracked by #81599

Dotnet-GitSync-Bot added area-System.Numerics untriaged New issue has not been triaged by the area owner labels Jan 3, 2020

TamarChristinaArm changed the title ~~API Proposal : Arm TableVectorLookup and TableVectorExtension~~ API Proposal : Arm TableVectorLookup and TableVectorExtension intrinsics Jan 3, 2020

tannergooding added api-ready-for-review and removed untriaged New issue has not been triaged by the area owner labels Mar 3, 2020

terrajobst added area-System.Runtime.Intrinsics and removed area-System.Numerics labels Mar 4, 2020

terrajobst added api-approved API was approved in API review, it can be implemented and removed api-ready-for-review labels Mar 17, 2020

tannergooding added the arch-arm64 label Mar 25, 2020

john-h-k mentioned this issue Apr 24, 2020

Optimize System.Buffers for arm64 using cross-platform intrinsics #35033

Closed

2 tasks

TamarChristinaArm mentioned this issue Apr 29, 2020

Add VectorTableList and TableVectorExtension intrinsics #35600

Merged

JulieLeeMSFT added this to the 5.0 milestone May 18, 2020

echesakov removed this from the 5.0 milestone Jun 12, 2020

echesakov added this to the Future milestone Jun 12, 2020

echesakov mentioned this issue Oct 20, 2020

[Arm64] Planned JIT work in .NET 6 #43629

Closed

29 tasks

kunalspathak mentioned this issue Mar 28, 2022

Improving ARM64 Performance in .NET 7.0 #64820

Closed

32 tasks

kunalspathak mentioned this issue Oct 13, 2022

Improving Arm64 Performance in .NET 8.0 #77010

Closed

28 tasks

kunalspathak self-assigned this Jan 4, 2023

kunalspathak mentioned this issue Jan 6, 2023

Arm64: Implement VectorTableLookup/VectorTableLookupExtension intrinsinsic + Consecutive registers support #80297

Merged

6 tasks

tannergooding mentioned this issue Feb 3, 2023

Arm VectorTableLookup and VectorTableExtension - Part 2 #81599

Closed

tannergooding closed this as completed Feb 24, 2023

ghost locked as resolved and limited conversation to collaborators Mar 26, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

API Proposal : Arm TableVectorLookup and TableVectorExtension intrinsics #1277

API Proposal : Arm TableVectorLookup and TableVectorExtension intrinsics #1277

TamarChristinaArm commented Jan 3, 2020

Gnbrkm41 commented Jan 4, 2020

TamarChristinaArm commented Jan 6, 2020

terrajobst commented Mar 17, 2020 •

edited

Loading

TamarChristinaArm commented Apr 20, 2020

echesakov commented Apr 20, 2020 •

edited

Loading

TamarChristinaArm commented Apr 21, 2020 •

edited

Loading

echesakov commented Apr 21, 2020

TamarChristinaArm commented Apr 21, 2020

echesakov commented Apr 21, 2020

TamarChristinaArm commented Apr 27, 2020 •

edited

Loading

TamarChristinaArm commented Apr 27, 2020

john-h-k commented May 5, 2020

TamarChristinaArm commented May 5, 2020

echesakov commented Jun 12, 2020 •

edited

Loading

tannergooding commented Feb 24, 2023

API Proposal : Arm TableVectorLookup and TableVectorExtension intrinsics #1277

API Proposal : Arm TableVectorLookup and TableVectorExtension intrinsics #1277

Comments

TamarChristinaArm commented Jan 3, 2020

Gnbrkm41 commented Jan 4, 2020

TamarChristinaArm commented Jan 6, 2020

terrajobst commented Mar 17, 2020 • edited Loading

TamarChristinaArm commented Apr 20, 2020

echesakov commented Apr 20, 2020 • edited Loading

TamarChristinaArm commented Apr 21, 2020 • edited Loading

echesakov commented Apr 21, 2020

TamarChristinaArm commented Apr 21, 2020

echesakov commented Apr 21, 2020

TamarChristinaArm commented Apr 27, 2020 • edited Loading

TamarChristinaArm commented Apr 27, 2020

john-h-k commented May 5, 2020

TamarChristinaArm commented May 5, 2020

echesakov commented Jun 12, 2020 • edited Loading

tannergooding commented Feb 24, 2023

terrajobst commented Mar 17, 2020 •

edited

Loading

echesakov commented Apr 20, 2020 •

edited

Loading

TamarChristinaArm commented Apr 21, 2020 •

edited

Loading

TamarChristinaArm commented Apr 27, 2020 •

edited

Loading

echesakov commented Jun 12, 2020 •

edited

Loading