diff --git a/Core/MIPS/MIPSVFPUUtils.cpp b/Core/MIPS/MIPSVFPUUtils.cpp index 7e46209c7628..57d5c4cad80a 100644 --- a/Core/MIPS/MIPSVFPUUtils.cpp +++ b/Core/MIPS/MIPSVFPUUtils.cpp @@ -17,7 +17,8 @@ #include #include -#include +#include +#include #include "Common/BitScan.h" #include "Common/CommonFuncs.h" @@ -245,14 +246,44 @@ void ReadMatrix(float *rd, MatrixSize size, int reg) { default: _assert_msg_(JIT, 0, "%s: Bad matrix size", __FUNCTION__); } - for (int i = 0; i < side; i++) { - for (int j = 0; j < side; j++) { - int index = mtx * 4; - if (transpose) - index += ((row+i)&3) + ((col+j)&3)*32; - else - index += ((col+j)&3) + ((row+i)&3)*32; - rd[j*4 + i] = V(index); + // The voffset ordering is now integrated in these formulas, + // eliminating a table lookup. + const float *v = currentMIPS->v + (size_t)mtx * 16; + if (transpose) { + if (side == 4 && col == 0 && row == 0) { + // Simple 4x4 transpose. TODO: Optimize. + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + rd[j * 4 + i] = v[i * 4 + j]; + } + } + } else { + for (int j = 0; j < side; j++) { + for (int i = 0; i < side; i++) { + int index = ((row + i) & 3) * 4 + ((col + j) & 3); + rd[j * 4 + i] = v[index]; + } + } + } + } else { + // Fast path + if (side == 4 && col == 0 && row == 0) { + memcpy(rd, v, sizeof(float) * 16); + /* + for (int j = 0; j < side; j++) { + for (int i = 0; i < side; i++) { + int index = mtx * 16 + j * 4 + i; + rd[j * 4 + i] = v[index]; + } + } + */ + } else { + for (int j = 0; j < side; j++) { + for (int i = 0; i < side; i++) { + int index = ((col + j) & 3) * 4 + ((row + i) & 3); + rd[j * 4 + i] = v[index]; + } + } } } } @@ -277,16 +308,49 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) { ERROR_LOG_REPORT(CPU, "Write mask used with vfpu matrix instruction."); } - for (int i = 0; i < side; i++) { - for (int j = 0; j < side; j++) { - // Hm, I wonder if this should affect matrices at all. - if (j != side -1 || !currentMIPS->VfpuWriteMask(i)) { - int index = mtx * 4; - if (transpose) - index += ((row+i)&3) + ((col+j)&3)*32; - else - index += ((col+j)&3) + ((row+i)&3)*32; - V(index) = rd[j*4+i]; + // The voffset ordering is now integrated in these formulas, + // eliminating a table lookup. + float *v = currentMIPS->v + (size_t)mtx * 16; + if (transpose) { + // Fast path + if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) { + for (int j = 0; j < side; j++) { + for (int i = 0; i < side; i++) { + int index = i * 4 + j; + v[index] = rd[j * 4 + i]; + } + } + } else { + for (int j = 0; j < side; j++) { + for (int i = 0; i < side; i++) { + // Hm, I wonder if this should affect matrices at all. + if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) { + int index = ((row + i) & 3) * 4 + ((col + j) & 3); + v[index] = rd[j * 4 + i]; + } + } + } + } + } else { + if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) { + memcpy(v, rd, sizeof(float) * 16); + /* + for (int j = 0; j < side; j++) { + for (int i = 0; i < side; i++) { + int index = j * 4 + i; + v[index] = rd[j * 4 + i]; + } + } + */ + } else { + for (int j = 0; j < side; j++) { + for (int i = 0; i < side; i++) { + // Hm, I wonder if this should affect matrices at all. + if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) { + int index = ((col + j) & 3) * 4 + ((row + i) & 3); + v[index] = rd[j * 4 + i]; + } + } } } }