Skip to content

Commit

Permalink
Maximize ALU utilization by avoiding pipeline bubbles
Browse files Browse the repository at this point in the history
  • Loading branch information
PENGUINLIONG committed Oct 30, 2020
1 parent 48083a2 commit 07ff520
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 65 deletions.
31 changes: 18 additions & 13 deletions src/kernels/compute_dp_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ MSTRINGIFY(
\n#undef MAD_16
\n#undef MAD_64
\n
\n#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
\n#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
\n#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
\n#define MAD_4(x, y, z) z += mad(y, x, y); z += mad(x, y, x); z += mad(y, x, y); z += mad(x, y, x);
\n#define MAD_16(x, y, z) MAD_4(x, y, z); MAD_4(x, y, z); MAD_4(x, y, z); MAD_4(x, y, z);
\n#define MAD_64(x, y, z) MAD_16(x, y, z); MAD_16(x, y, z); MAD_16(x, y, z); MAD_16(x, y, z);
\n

\n
Expand All @@ -28,68 +28,73 @@ __kernel void compute_dp_v1(__global double *ptr, double _A)
{
double x = _A;
double y = (double)get_local_id(0);
double z = 0;

for(int i=0; i<128; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = y;
ptr[get_global_id(0)] = z;
}


__kernel void compute_dp_v2(__global double *ptr, double _A)
{
double2 x = (double2)(_A, (_A+1));
double2 y = (double2)get_local_id(0);
double2 z = 0;

for(int i=0; i<64; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1);
ptr[get_global_id(0)] = (z.S0) + (z.S1);
}

__kernel void compute_dp_v4(__global double *ptr, double _A)
{
double4 x = (double4)(_A, (_A+1), (_A+2), (_A+3));
double4 y = (double4)get_local_id(0);
double4 z = 0;

for(int i=0; i<32; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3);
}


__kernel void compute_dp_v8(__global double *ptr, double _A)
{
double8 x = (double8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
double8 y = (double8)get_local_id(0);
double8 z = 0;

for(int i=0; i<16; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3) + (z.S4) + (z.S5) + (z.S6) + (z.S7);
}

__kernel void compute_dp_v16(__global double *ptr, double _A)
{
double16 x = (double16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
(_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
double16 y = (double16)get_local_id(0);
double16 z = 0;

for(int i=0; i<8; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

double2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
double2 t = (z.S01) + (z.S23) + (z.S45) + (z.S67) + (z.S89) + (z.SAB) + (z.SCD) + (z.SEF);
ptr[get_global_id(0)] = t.S0 + t.S1;
}

Expand Down
31 changes: 18 additions & 13 deletions src/kernels/compute_hp_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ MSTRINGIFY(
\n#undef MAD_16
\n#undef MAD_64
\n
\n#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
\n#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
\n#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
\n#define MAD_4(x, y, z) z += mad(y, x, y); z += mad(x, y, x); z += mad(y, x, y); z += mad(x, y, x);
\n#define MAD_16(x, y, z) MAD_4(x, y, z); MAD_4(x, y, z); MAD_4(x, y, z); MAD_4(x, y, z);
\n#define MAD_64(x, y, z) MAD_16(x, y, z); MAD_16(x, y, z); MAD_16(x, y, z); MAD_16(x, y, z);
\n

\n
Expand All @@ -26,13 +26,14 @@ __kernel void compute_hp_v1(__global half *ptr, float _B)
half _A = (half)_B;
half x = _A;
half y = (half)get_local_id(0);
half z = 0;

for(int i=0; i<128; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = y;
ptr[get_global_id(0)] = z;
}


Expand All @@ -41,27 +42,29 @@ __kernel void compute_hp_v2(__global half *ptr, float _B)
half _A = (half)_B;
half2 x = (half2)(_A, (_A+1));
half2 y = (half2)get_local_id(0);
half2 z = 0;

for(int i=0; i<64; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1);
ptr[get_global_id(0)] = (z.S0) + (z.S1);
}

__kernel void compute_hp_v4(__global half *ptr, float _B)
{
half _A = (half)_B;
half4 x = (half4)(_A, (_A+1), (_A+2), (_A+3));
half4 y = (half4)get_local_id(0);
half4 z = 0;

for(int i=0; i<32; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3);
}


Expand All @@ -70,13 +73,14 @@ __kernel void compute_hp_v8(__global half *ptr, float _B)
half _A = (half)_B;
half8 x = (half8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
half8 y = (half8)get_local_id(0);
half8 z = 0;

for(int i=0; i<16; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3) + (z.S4) + (z.S5) + (z.S6) + (z.S7);
}

__kernel void compute_hp_v16(__global half *ptr, float _B)
Expand All @@ -85,13 +89,14 @@ __kernel void compute_hp_v16(__global half *ptr, float _B)
half16 x = (half16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
(_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
half16 y = (half16)get_local_id(0);
half16 z = 0;

for(int i=0; i<8; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

half2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
half2 t = (z.S01) + (z.S23) + (z.S45) + (z.S67) + (z.S89) + (z.SAB) + (z.SCD) + (z.SEF);
ptr[get_global_id(0)] = t.S0 + t.S1;
}

Expand Down
31 changes: 18 additions & 13 deletions src/kernels/compute_int24_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,77 +6,82 @@ MSTRINGIFY(
\n#undef MAD_16INT
\n#undef MAD_64INT
\n
\n#define MAD_4INT(x, y) x = mad24(y,x,y); y = mad24(x,y,x); x = mad24(y,x,y); y = mad24(x,y,x);
\n#define MAD_16INT(x, y) MAD_4INT(x, y); MAD_4INT(x, y); MAD_4INT(x, y); MAD_4INT(x, y);
\n#define MAD_64INT(x, y) MAD_16INT(x, y); MAD_16INT(x, y); MAD_16INT(x, y); MAD_16INT(x, y);
\n#define MAD_4INT(x, y, z) z += mad24(y,x,y); z += mad24(x,y,x); z += mad24(y,x,y); z += mad24(x,y,x);
\n#define MAD_16INT(x, y, z) MAD_4INT(x, y, z); MAD_4INT(x, y, z); MAD_4INT(x, y, z); MAD_4INT(x, y, z);
\n#define MAD_64INT(x, y, z) MAD_16INT(x, y, z); MAD_16INT(x, y, z); MAD_16INT(x, y, z); MAD_16INT(x, y, z);
\n

__kernel void compute_intfast_v1(__global int *ptr, int _A)
{
int x = _A;
int y = (int)get_local_id(0);
int z = 0;

for(int i=0; i<64; i++)
{
MAD_16INT(x, y);
MAD_16INT(x, y, z);
}

ptr[get_global_id(0)] = y;
ptr[get_global_id(0)] = z;
}


__kernel void compute_intfast_v2(__global int *ptr, int _A)
{
int2 x = (int2)(_A, (_A+1));
int2 y = (int2)get_local_id(0);
int2 z = 0;

for(int i=0; i<32; i++)
{
MAD_16INT(x, y);
MAD_16INT(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1);
ptr[get_global_id(0)] = (z.S0) + (z.S1);
}

__kernel void compute_intfast_v4(__global int *ptr, int _A)
{
int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3));
int4 y = (int4)get_local_id(0);
int4 z = 0;

for(int i=0; i<16; i++)
{
MAD_16INT(x, y);
MAD_16INT(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3);
}


__kernel void compute_intfast_v8(__global int *ptr, int _A)
{
int8 x = (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
int8 y = (int8)get_local_id(0);
int8 z = 0;

for(int i=0; i<8; i++)
{
MAD_16INT(x, y);
MAD_16INT(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3) + (z.S4) + (z.S5) + (z.S6) + (z.S7);
}

__kernel void compute_intfast_v16(__global int *ptr, int _A)
{
int16 x = (int16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
(_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
int16 y = (int16)get_local_id(0);
int16 z = 0;

for(int i=0; i<4; i++)
{
MAD_16INT(x, y);
MAD_16INT(x, y, z);
}

int2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
int2 t = (z.S01) + (z.S23) + (z.S45) + (z.S67) + (z.S89) + (z.SAB) + (z.SCD) + (z.SEF);
ptr[get_global_id(0)] = t.S0 + t.S1;
}

Expand Down
31 changes: 18 additions & 13 deletions src/kernels/compute_integer_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,77 +6,82 @@ MSTRINGIFY(
\n#undef MAD_16
\n#undef MAD_64
\n
\n#define MAD_4(x, y) x = (y*x) + y; y = (x*y) + x; x = (y*x) + y; y = (x*y) + x;
\n#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
\n#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
\n#define MAD_4(x, y, z) z += (y*x) + y; z += (x*y) + x; z += (y*x) + y; z += (x*y) + x;
\n#define MAD_16(x, y, z) MAD_4(x, y, z); MAD_4(x, y, z); MAD_4(x, y, z); MAD_4(x, y, z);
\n#define MAD_64(x, y, z) MAD_16(x, y, z); MAD_16(x, y, z); MAD_16(x, y, z); MAD_16(x, y, z);
\n

__kernel void compute_integer_v1(__global int *ptr, int _A)
{
int x = _A;
int y = (int)get_local_id(0);
int z = 0;

for(int i=0; i<64; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = y;
ptr[get_global_id(0)] = z;
}


__kernel void compute_integer_v2(__global int *ptr, int _A)
{
int2 x = (int2)(_A, (_A+1));
int2 y = (int2)get_local_id(0);
int2 z = 0;

for(int i=0; i<32; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1);
ptr[get_global_id(0)] = (z.S0) + (z.S1);
}

__kernel void compute_integer_v4(__global int *ptr, int _A)
{
int4 x = (int4)(_A, (_A+1), (_A+2), (_A+3));
int4 y = (int4)get_local_id(0);
int4 z = 0;

for(int i=0; i<16; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3);
}


__kernel void compute_integer_v8(__global int *ptr, int _A)
{
int8 x = (int8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7));
int8 y = (int8)get_local_id(0);
int8 z = 0;

for(int i=0; i<8; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7);
ptr[get_global_id(0)] = (z.S0) + (z.S1) + (z.S2) + (z.S3) + (z.S4) + (z.S5) + (z.S6) + (z.S7);
}

__kernel void compute_integer_v16(__global int *ptr, int _A)
{
int16 x = (int16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7),
(_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15));
int16 y = (int16)get_local_id(0);
int16 z = 0;

for(int i=0; i<4; i++)
{
MAD_16(x, y);
MAD_16(x, y, z);
}

int2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF);
int2 t = (z.S01) + (z.S23) + (z.S45) + (z.S67) + (z.S89) + (z.SAB) + (z.SCD) + (z.SEF);
ptr[get_global_id(0)] = t.S0 + t.S1;
}

Expand Down
Loading

0 comments on commit 07ff520

Please sign in to comment.