just allow CPP/CUDA/Metal to legalize their Matrix inputs.

Memory packing still fails because the CPP/CUDA/Metal backends do not handle it yet.
ArielG-NV · Jul 16, 2024 · 47995ab · 47995ab
1 parent 59343c1
commit 47995ab
Show file tree

Hide file tree

Showing 7 changed files with 142 additions and 32 deletions.
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
@@ -207,6 +207,12 @@ union Union64
     double d;
 };
 
+template<typename T>
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float make_float(T val) // Generic helper: converts a value of any type T to float.
+{
+    return (float)val; // Plain C-style cast, so T must be explicitly convertible to float.
+}
+
 SLANG_FORCE_INLINE SLANG_CUDA_CALL float _slang_fmod(float x, float y)
 {
     return ::fmodf(x, y);

diff --git a/source/slang/slang-compiler.h b/source/slang/slang-compiler.h
@@ -1755,6 +1755,8 @@ namespace Slang
     /// Are we generating code for a CUDA API (CUDA / OptiX)?
     bool isCUDATarget(TargetRequest* targetReq);
 
+    /// Are we generating code for a CPU target?
+    bool isCPUTarget(TargetRequest* targetReq);
 
         /// A request to generate output in some target format.
     class TargetRequest : public RefObject

diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
@@ -1257,15 +1257,7 @@ Result linkAndOptimizeIR(
     if (requiredLoweringPassSet.meshOutput)
         legalizeMeshOutputTypes(irModule);
 
-    if (options.shouldLegalizeExistentialAndResourceTypes)
-    {
-        if (!isMetalTarget(targetRequest))
-        {
-            // We need to lower any types used in a buffer resource (e.g. ContantBuffer or StructuredBuffer) into
-            // a simple storage type that has target independent layout based on the kind of buffer resource.
-            lowerBufferElementTypeToStorageType(targetProgram, irModule);
-        }
-    }
+    lowerBufferElementTypeToStorageType(targetProgram, irModule);
 
     // Rewrite functions that return arrays to return them via `out` parameter,
     // since our target languages doesn't allow returning arrays.

diff --git a/source/slang/slang-ir-lower-buffer-element-type.cpp b/source/slang/slang-ir-lower-buffer-element-type.cpp
@@ -877,7 +877,9 @@ namespace Slang
     void lowerBufferElementTypeToStorageType(TargetProgram* target, IRModule* module, bool lowerBufferPointer)
     {
         SlangMatrixLayoutMode defaultMatrixMode = (SlangMatrixLayoutMode)target->getOptionSet().getMatrixLayoutMode();
-        if (defaultMatrixMode == SLANG_MATRIX_LAYOUT_MODE_UNKNOWN)
+        if ((isCPUTarget(target->getTargetReq()) || isCUDATarget(target->getTargetReq()) || isMetalTarget(target->getTargetReq())))
+            defaultMatrixMode = SLANG_MATRIX_LAYOUT_ROW_MAJOR;
+        else if (defaultMatrixMode == SLANG_MATRIX_LAYOUT_MODE_UNKNOWN)
             defaultMatrixMode = SLANG_MATRIX_LAYOUT_ROW_MAJOR;
         LoweredElementTypeContext context(target, lowerBufferPointer, defaultMatrixMode);
         context.processModule(module);

diff --git a/tests/compute/column-major.slang b/tests/compute/column-major.slang
@@ -1,33 +1,59 @@
 // column-major.slang
 
-// Unfortunately CPU and CUDA only work with row layout, so they have to be disabled here.
-
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -output-using-type -compile-arg -O3 -shaderobj
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -output-using-type -shaderobj -Xslang -matrix-layout-column-major
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -output-using-type -dx12 -shaderobj -Xslang -matrix-layout-column-major
-//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -output-using-type -shaderobj -Xslang -matrix-layout-column-major
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -output-using-type -shaderobj
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -output-using-type -mtl -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cpu -compute -compile-arg -O3 -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-slang -compute -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-slang -compute -dx12 -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute -emit-spirv-via-glsl -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cuda -compute -shaderobj -Xslang -matrix-layout-column-major
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-mtl -compute -shaderobj -Xslang -matrix-layout-column-major
 
 // This data is in column major layout order.... 
 //TEST_INPUT:cbuffer(data=[1.0 0.0 0.0 10.0  0.0 1.0 0.0 20.0  0.0 0.0 1.0 30.0  0.0 0.0 0.0 1.0]):name matrixBuffer
 
 ConstantBuffer<float4x4> matrixBuffer;
 
-//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name output
-RWStructuredBuffer<float> output;
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name output
+RWStructuredBuffer<uint> output;
+
+bool floatCheck(float data, float valueToCheckFor) // Approximate float comparison used by the checks in this test.
+{
+    return data < (valueToCheckFor + 0.001) && data > valueToCheckFor - 0.001; // true only when data lies strictly within +/-0.001 of valueToCheckFor
+}
 
 [numthreads(1, 1, 1)]
 void computeMain(uint3 tid : SV_DispatchThreadID)
 {
     float4 v = float4(1, 2, 3, 1);
 
-    float4x4 M = matrixBuffer;
+    float4x4 M1 = matrixBuffer;
 
-    float4 r = mul(v, M);
-
-    output[0] = r.x;
-    output[1] = r.y;
-    output[2] = r.z;
-    output[3] = r.w;
+    float4 r = mul(v, M1);
+
+    float4x4 M2 = mul(M1, M1);
+
+    float4x4 M3 = float4x4(
+            1.0, 0.0, 0.0, 10.0, 
+            0.0, 1.0, 0.0, 20.0,
+            0.0, 0.0, 1.0, 30.0,
+            0.0, 0.0, 0.0, 1.0
+        );
+
+    output[0] = uint(true
+            && floatCheck(r.x, 11)
+            && floatCheck(r.y, 22)
+            && floatCheck(r.z, 33)
+            && floatCheck(r.w, 1)
+
+            && floatCheck(M1[3][0], 10)
+
+            && floatCheck(M2[3][0], 20)
+            && floatCheck(M2._41, 20)
+            && floatCheck(M2._41_32[0], 20)
+            && floatCheck(M2._33_42[0], 1)
+            && floatCheck(M2._42_33[0], 40)
+
+            && floatCheck(M3[0][3], 10)
+        );
+    //BUF: 1
 }
diff --git a/tests/compute/column-major.slang.expected.txt b/tests/compute/column-major.slang.expected.txt
diff --git a/tests/compute/memory-packing.slang b/tests/compute/memory-packing.slang
@@ -0,0 +1,87 @@
+// memory-packing.slang
+
+// Metal/CPP/CUDA do not correctly deal with packing currently.
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cpu -compute
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-cuda -compute
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-mtl -compute
+
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-slang -compute -dx12
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=BUF):-vk -compute -emit-spirv-via-glsl
+
+//TEST_INPUT:cbuffer(data=[1.0 2.0 3.0 0.0  4.0 5.0 6.0 0.0  7.0 8.0 9.0 0]):name matrixTest
+ConstantBuffer<row_major float3x3> matrixTestCBuf1; // Row-major 3x3 input. NOTE(review): the TEST_INPUT above is named `matrixTest`, not `matrixTestCBuf1` — confirm the binding still resolves.
+
+//TEST_INPUT:cbuffer(data=[1.0 4.0 7.0 0.0  2.0 5.0 8.0 0.0  3.0 6.0 9.0 0.0]):name colMatrixBuffer
+ConstantBuffer<column_major float3x3> matrixTestCBuf2; // Column-major 3x3 input. NOTE(review): the TEST_INPUT above is named `colMatrixBuffer`, not `matrixTestCBuf2` — confirm the binding still resolves.
+
+// struct float3x3{float3[3]
+// {
+//     float3 data1; 
+//     float pad1;
+//     float3 data2;
+//     float pad2;
+//     float3 data3;
+//     float pad3;
+// }
+
+struct NeedsPadding // Two float2 fields; the commented-out pad members mark where the test data below carries filler values (100).
+{
+    float2 data1;
+    // float2 pad1;
+    float2 data2;
+    // float2 pad2;
+};
+//TEST_INPUT:cbuffer(data=[1.0 2.0 100 100  3.0 4.0 100 100]):name structTest
+ConstantBuffer<NeedsPadding> structTestCBuf1; // NOTE(review): the TEST_INPUT above is named `structTest`, not `structTestCBuf1` — confirm the binding still resolves.
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name output
+RWStructuredBuffer<uint> output;
+
+bool floatCheck(float data, float valueToCheckFor) // Approximate float comparison used by the checks in this test.
+{
+    return data < (valueToCheckFor + 0.001) && data > valueToCheckFor - 0.001; // true only when data lies strictly within +/-0.001 of valueToCheckFor
+}
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 tid : SV_DispatchThreadID) // Entry point: copies the constant-buffer inputs into locals and writes a single pass/fail flag to output[0].
+{
+    float3x3 matrixTest1;
+    matrixTest1 = matrixTestCBuf1; // load of the row_major constant-buffer matrix
+
+    float3x3 matrixTest2;
+    matrixTest2 = matrixTestCBuf2; // load of the column_major constant-buffer matrix
+
+    NeedsPadding structTest1; // NOTE(review): never assigned from structTestCBuf1, so the structTest1 checks below read an uninitialized value — confirm whether `structTest1 = structTestCBuf1;` was intended.
+
+    // Note: default is column major
+    output[0] = bool(true
+            && floatCheck(matrixTest1[0][0], 1)
+            && floatCheck(matrixTest1[0][1], 2)
+            && floatCheck(matrixTest1[0][2], 3)
+            && floatCheck(matrixTest1[1][0], 4)
+            && floatCheck(matrixTest1[1][1], 5)
+            && floatCheck(matrixTest1[1][2], 6)
+            && floatCheck(matrixTest1[2][0], 7)
+            && floatCheck(matrixTest1[2][1], 8)
+            && floatCheck(matrixTest1[2][2], 9)
+
+
+            && floatCheck(matrixTest2[0][0], 1)
+            && floatCheck(matrixTest2[0][1], 2)
+            && floatCheck(matrixTest2[0][2], 3)
+            && floatCheck(matrixTest2[1][0], 4)
+            && floatCheck(matrixTest2[1][1], 5)
+            && floatCheck(matrixTest2[1][2], 6)
+            && floatCheck(matrixTest2[2][0], 7)
+            && floatCheck(matrixTest2[2][1], 8)
+            && floatCheck(matrixTest2[2][2], 9)
+
+
+            && floatCheck(structTest1.data1[0], 1)
+            && floatCheck(structTest1.data1[1], 2)
+            && floatCheck(structTest1.data2[0], 3)
+            && floatCheck(structTest1.data2[1], 4)
+        ) ? 1 : 0; // output[0] becomes 1 only when every check above passed, otherwise 0
+    //BUF: 1
+}