Skip to content

Commit

Permalink
arm/convolution_3x3_pack1to8_fp16s: prefer ldr/str over ld1/st1
Browse files Browse the repository at this point in the history
Depending on the microarchitecture, ldr/str can be faster than ld1/st1,
especially compared with the single-lane form of ld1. For example, on Cortex-A75:

1. the execution latency of 'ldr q0' and of 'ldr h0' is 5 cycles
2. the execution latency of 'ld1 {v0.16b}' is 6 cycles
3. the execution latency of 'ld1 {v0.h}[0]' is 8 cycles

On Cortex-X3:
1. the execution latency of 'ldr q0' and of 'ldr h0' is 6 cycles
2. the execution latency of 'ld1 {v0.16b}' is 6 cycles
3. the execution latency of 'ld1 {v0.h}[0]' is 8 cycles

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
  • Loading branch information
quink-black committed Jul 25, 2024
1 parent 051b04f commit 2ace409
Showing 1 changed file with 34 additions and 34 deletions.
68 changes: 34 additions & 34 deletions src/layer/arm/convolution_3x3_pack1to8_fp16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"sub %0, %0, #64 \n"

"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.8h}, [%1], #16 \n" // r0
"ld1 {v1.4h}, [%1] \n"
"ldr q0, [%1], #16 \n" // r0
"ldr s1, [%1] \n"

"fmla v24.8h, %8.8h, v0.h[0] \n"
"fmla v25.8h, %8.8h, v0.h[1] \n"
Expand Down Expand Up @@ -99,8 +99,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %10.8h, v1.h[1] \n"

"prfm pldl1keep, [%2, #128] \n"
"ld1 {v2.8h}, [%2], #16 \n" // r1
"ld1 {v3.4h}, [%2] \n"
"ldr q2, [%2], #16 \n" // r1
"ldr s3, [%2] \n"

"fmla v24.8h, %11.8h, v2.h[0] \n"
"fmla v25.8h, %11.8h, v2.h[1] \n"
Expand Down Expand Up @@ -130,8 +130,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %13.8h, v3.h[1] \n"

"prfm pldl1keep, [%3, #128] \n"
"ld1 {v4.8h}, [%3], #16 \n" // r2
"ld1 {v5.4h}, [%3] \n"
"ldr q4, [%3], #16 \n" // r2
"ldr s5, [%3] \n"

"fmla v24.8h, %14.8h, v4.h[0] \n"
"fmla v25.8h, %14.8h, v4.h[1] \n"
Expand Down Expand Up @@ -189,7 +189,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3

"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.8h}, [%1] \n" // r0
"ldr q0, [%1] \n" // r0

"fmla v28.8h, %8.8h, v0.h[0] \n"
"fmla v29.8h, %8.8h, v0.h[1] \n"
Expand All @@ -207,7 +207,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %10.8h, v0.h[5] \n"

"prfm pldl1keep, [%2, #128] \n"
"ld1 {v1.8h}, [%2] \n" // r1
"ldr q1, [%2] \n" // r1

"fmla v28.8h, %11.8h, v1.h[0] \n"
"fmla v29.8h, %11.8h, v1.h[1] \n"
Expand All @@ -225,7 +225,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %13.8h, v1.h[5] \n"

"prfm pldl1keep, [%3, #128] \n"
"ld1 {v2.8h}, [%3] \n" // r2
"ldr q2, [%3] \n" // r2

"fmla v28.8h, %14.8h, v2.h[0] \n"
"fmla v29.8h, %14.8h, v2.h[1] \n"
Expand Down Expand Up @@ -274,7 +274,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1

"prfm pldl1keep, [%1, #64] \n"
"ld1 {v0.4h}, [%1] \n" // r0
"ldr d0, [%1] \n" // r0

"fmla v30.8h, %8.8h, v0.h[0] \n"
"fmla v31.8h, %8.8h, v0.h[1] \n"
Expand All @@ -284,7 +284,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %10.8h, v0.h[3] \n"

"prfm pldl1keep, [%2, #64] \n"
"ld1 {v1.4h}, [%2] \n" // r1
"ldr d1, [%2] \n" // r1

"fmla v30.8h, %11.8h, v1.h[0] \n"
"fmla v31.8h, %11.8h, v1.h[1] \n"
Expand All @@ -294,7 +294,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %13.8h, v1.h[3] \n"

"prfm pldl1keep, [%3, #64] \n"
"ld1 {v2.4h}, [%3] \n" // r2
"ldr d2, [%3] \n" // r2

"fmla v30.8h, %14.8h, v2.h[0] \n"
"fmla v31.8h, %14.8h, v2.h[1] \n"
Expand Down Expand Up @@ -332,24 +332,24 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
{
asm volatile(
"prfm pldl1keep, [%0, #128] \n"
"ld1 {v30.8h}, [%0] \n" // sum0
"ldr q30, [%0] \n" // sum0

"prfm pldl1keep, [%1, #64] \n"
"ld1 {v0.4h}, [%1] \n" // r0
"ldr d0, [%1] \n" // r0

"fmla v30.8h, %8.8h, v0.h[0] \n"
"fmla v30.8h, %9.8h, v0.h[1] \n"
"fmla v30.8h, %10.8h, v0.h[2] \n"

"prfm pldl1keep, [%2, #64] \n"
"ld1 {v1.4h}, [%2] \n" // r1
"ldr d1, [%2] \n" // r1

"fmla v30.8h, %11.8h, v1.h[0] \n"
"fmla v30.8h, %12.8h, v1.h[1] \n"
"fmla v30.8h, %13.8h, v1.h[2] \n"

"prfm pldl1keep, [%3, #64] \n"
"ld1 {v2.4h}, [%3] \n" // r2
"ldr d2, [%3] \n" // r2

"fmla v30.8h, %14.8h, v2.h[0] \n"
"fmla v30.8h, %15.8h, v2.h[1] \n"
Expand All @@ -359,7 +359,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"add %2, %2, #2 \n"
"add %3, %3, #2 \n"

"st1 {v30.8h}, [%0], #16 \n"
"str q30, [%0], #16 \n"

: "=r"(outptr0), // %0
"=r"(r0), // %1
Expand Down Expand Up @@ -445,8 +445,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3

"prfm pldl1keep, [%1, #128] \n"
"ld1 {v0.8h}, [%1], #16 \n" // r0
"ld1 {v1.h}[0], [%1] \n"
"ldr q0, [%1], #16 \n" // r0
"ldr h1, [%1] \n"

"fmla v28.8h, %8.8h, v0.h[0] \n"
"fmla v29.8h, %8.8h, v0.h[2] \n"
Expand All @@ -464,8 +464,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %10.8h, v1.h[0] \n"

"prfm pldl1keep, [%2, #128] \n"
"ld1 {v2.8h}, [%2], #16 \n" // r1
"ld1 {v3.h}[0], [%2] \n"
"ldr q2, [%2], #16 \n" // r1
"ldr h3, [%2] \n"

"fmla v28.8h, %11.8h, v2.h[0] \n"
"fmla v29.8h, %11.8h, v2.h[2] \n"
Expand All @@ -483,8 +483,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %13.8h, v3.h[0] \n"

"prfm pldl1keep, [%3, #128] \n"
"ld1 {v4.8h}, [%3], #16 \n" // r2
"ld1 {v5.h}[0], [%3] \n"
"ldr q4, [%3], #16 \n" // r2
"ldr h5, [%3] \n"

"fmla v28.8h, %14.8h, v4.h[0] \n"
"fmla v29.8h, %14.8h, v4.h[2] \n"
Expand Down Expand Up @@ -529,8 +529,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1

"prfm pldl1keep, [%1, #64] \n"
"ld1 {v0.4h}, [%1], #8 \n" // r0
"ld1 {v1.h}[0], [%1] \n"
"ldr d0, [%1], #8 \n" // r0
"ldr h1, [%1] \n"

"fmla v30.8h, %8.8h, v0.h[0] \n"
"fmla v31.8h, %8.8h, v0.h[2] \n"
Expand All @@ -540,8 +540,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %10.8h, v1.h[0] \n"

"prfm pldl1keep, [%2, #64] \n"
"ld1 {v2.4h}, [%2], #8 \n" // r1
"ld1 {v3.h}[0], [%2] \n"
"ldr d2, [%2], #8 \n" // r1
"ldr h3, [%2] \n"

"fmla v30.8h, %11.8h, v2.h[0] \n"
"fmla v31.8h, %11.8h, v2.h[2] \n"
Expand All @@ -551,8 +551,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"fmla v31.8h, %13.8h, v3.h[0] \n"

"prfm pldl1keep, [%3, #64] \n"
"ld1 {v4.4h}, [%3], #8 \n" // r2
"ld1 {v5.h}[0], [%3] \n"
"ldr d4, [%3], #8 \n" // r2
"ldr h5, [%3] \n"

"fmla v30.8h, %14.8h, v4.h[0] \n"
"fmla v31.8h, %14.8h, v4.h[2] \n"
Expand Down Expand Up @@ -586,24 +586,24 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
{
asm volatile(
"prfm pldl1keep, [%0, #128] \n"
"ld1 {v30.8h}, [%0] \n" // sum0
"ldr q30, [%0] \n" // sum0

"prfm pldl1keep, [%1, #64] \n"
"ld1 {v0.4h}, [%1] \n" // r0
"ldr d0, [%1] \n" // r0

"fmla v30.8h, %8.8h, v0.h[0] \n"
"fmla v30.8h, %9.8h, v0.h[1] \n"
"fmla v30.8h, %10.8h, v0.h[2] \n"

"prfm pldl1keep, [%2, #64] \n"
"ld1 {v1.4h}, [%2] \n" // r1
"ldr d1, [%2] \n" // r1

"fmla v30.8h, %11.8h, v1.h[0] \n"
"fmla v30.8h, %12.8h, v1.h[1] \n"
"fmla v30.8h, %13.8h, v1.h[2] \n"

"prfm pldl1keep, [%3, #64] \n"
"ld1 {v2.4h}, [%3] \n" // r2
"ldr d2, [%3] \n" // r2

"fmla v30.8h, %14.8h, v2.h[0] \n"
"fmla v30.8h, %15.8h, v2.h[1] \n"
Expand All @@ -613,7 +613,7 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob
"add %2, %2, #4 \n"
"add %3, %3, #4 \n"

"st1 {v30.8h}, [%0], #16 \n"
"str q30, [%0], #16 \n"

: "=r"(outptr0), // %0
"=r"(r0), // %1
Expand Down

0 comments on commit 2ace409

Please sign in to comment.