[X86] Enable TuningSlowDivide64 on Barcelona/Bobcat/Bulldozer/Ryzen Families #91277
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Despite most AMD CPUs having a lower latency for i64 divisions that converge early, we are still better off testing for values representable as i32 and performing an i32 division if possible.

All AMD CPUs appear to have been missed when we added the "idivq-to-divl" attribute - now matches Intel CPU behaviour (and the x86-64/v2/3/4 levels).

Unfortunately the difference in code scheduling means I've had to stop using the update_llc_test_checks script and just use old-fashioned CHECK-DAG checks for divl/divq pairs.

Fixes #90985

Full diff: https://github.com/llvm/llvm-project/pull/91277.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9e731947893de9..aaf1756e858208 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1341,6 +1341,7 @@ def ProcessorFeatures {
FeatureCMOV,
FeatureX86_64];
list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
+ TuningSlowDivide64,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
@@ -1363,6 +1364,7 @@ def ProcessorFeatures {
list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
+ TuningSlowDivide64,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
@@ -1385,6 +1387,7 @@ def ProcessorFeatures {
TuningFastVectorShiftMasks,
TuningFastMOVBE,
TuningSBBDepBreaking,
+ TuningSlowDivide64,
TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1409,6 +1412,7 @@ def ProcessorFeatures {
FeatureLWP,
FeatureLAHFSAHF64];
list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
+ TuningSlowDivide64,
TuningFast11ByteNOP,
TuningFastScalarShiftMasks,
TuningBranchFusion,
@@ -1488,6 +1492,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVariablePerLaneShuffle,
TuningFastMOVBE,
+ TuningSlowDivide64,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
index 66d7082d9b7c55..0c46501e4b9717 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Check that 64-bit division is bypassed correctly.
; RUN: llc < %s -mtriple=x86_64-- -mattr=-idivq-to-divl | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+idivq-to-divl | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
@@ -13,17 +12,17 @@
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
; RUN: llc < %s -mtriple=x86_64-- -mcpu=alderlake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
; AMD
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
; Additional tests for 64-bit divide bypass
@@ -40,22 +39,8 @@ define i64 @sdiv_quotient(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: sdiv_quotient:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB0_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: cqto
-; SLOW-DIVQ-NEXT: idivq %rsi
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB0_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: idivq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = sdiv i64 %a, %b
ret i64 %result
}
@@ -92,23 +77,8 @@ define i64 @sdiv_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: sdiv_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB3_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: cqto
-; SLOW-DIVQ-NEXT: idivq %rsi
-; SLOW-DIVQ-NEXT: movq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB3_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: movl %edx, %eax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: idivq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = srem i64 %a, %b
ret i64 %result
}
@@ -147,25 +117,8 @@ define i64 @sdiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: sdiv_quotient_and_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB6_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: cqto
-; SLOW-DIVQ-NEXT: idivq %rsi
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB6_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $edx killed $edx def $rdx
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: idivq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%resultdiv = sdiv i64 %a, %b
%resultrem = srem i64 %a, %b
%result = add i64 %resultdiv, %resultrem
@@ -213,22 +166,8 @@ define i64 @udiv_quotient(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: udiv_quotient:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB9_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divq %rsi
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB9_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: divq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = udiv i64 %a, %b
ret i64 %result
}
@@ -265,23 +204,8 @@ define i64 @udiv_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: udiv_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB12_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divq %rsi
-; SLOW-DIVQ-NEXT: movq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB12_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: movl %edx, %eax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: divq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%result = urem i64 %a, %b
ret i64 %result
}
@@ -320,25 +244,8 @@ define i64 @udiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; FAST-DIVQ-NEXT: retq
;
; SLOW-DIVQ-LABEL: udiv_quotient_and_remainder:
-; SLOW-DIVQ: # %bb.0:
-; SLOW-DIVQ-NEXT: movq %rdi, %rax
-; SLOW-DIVQ-NEXT: movq %rdi, %rcx
-; SLOW-DIVQ-NEXT: orq %rsi, %rcx
-; SLOW-DIVQ-NEXT: shrq $32, %rcx
-; SLOW-DIVQ-NEXT: je .LBB15_1
-; SLOW-DIVQ-NEXT: # %bb.2:
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divq %rsi
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
-; SLOW-DIVQ-NEXT: .LBB15_1:
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT: xorl %edx, %edx
-; SLOW-DIVQ-NEXT: divl %esi
-; SLOW-DIVQ-NEXT: # kill: def $edx killed $edx def $rdx
-; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
-; SLOW-DIVQ-NEXT: addq %rdx, %rax
-; SLOW-DIVQ-NEXT: retq
+; SLOW-DIVQ-DAG: divq %rsi
+; SLOW-DIVQ-DAG: divl %esi
%resultdiv = udiv i64 %a, %b
%resultrem = urem i64 %a, %b
%result = add i64 %resultdiv, %resultrem
[X86] Enable TuningSlowDivide64 on Barcelona/Bobcat/Bulldozer/Ryzen Families

Despite most AMD CPUs having a lower latency for i64 divisions that converge early, we are still better off testing for values representable as i32 and performing an i32 division if possible. All AMD CPUs appear to have been missed when we added the "idivq-to-divl" attribute - now matches most Intel CPU behaviour (and the x86-64/v2/3/4 levels). Unfortunately the difference in code scheduling means I've had to stop using the update_llc_test_checks script and just use old-fashioned CHECK-DAG checks for divl/divq pairs.

Fixes llvm#90985
ping? @adibiagio @ganeshgit any objections?
@RKSimon LGTM!
Despite most AMD CPUs having a lower latency for i64 divisions that converge early, we are still better off testing for values representable as i32 and performing an i32 division if possible.
All AMD CPUs appear to have been missed when we added the "idivq-to-divl" attribute - this patch now matches Intel CPU behaviour (and the x86-64/v2/3/4 levels).
Unfortunately the difference in code scheduling means I've had to stop using the update_llc_test_checks script and just use old-fashioned CHECK-DAG checks for divl/divq pairs.
Fixes #90985
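
For reference, the codegen difference this tuning flag controls is visible in the removed SLOW-DIVQ checks above: the two operands are OR'd together, the high 32 bits are tested, and a 32-bit divl is used when both values fit in 32 bits. Below is a minimal C++ sketch of that bypass for the unsigned case, mirroring the orq / shrq $32 / je guard in the old checks; the function name is purely illustrative and not part of LLVM.

#include <cstdint>

// Sketch of the i64 -> i32 division bypass that TuningSlowDivide64 enables.
uint64_t udiv64_bypass(uint64_t a, uint64_t b) {
  if (((a | b) >> 32) == 0) {
    // Both operands are representable as i32: a cheap 32-bit divl suffices.
    return static_cast<uint32_t>(a) / static_cast<uint32_t>(b);
  }
  // Otherwise fall back to the full 64-bit divq.
  return a / b;
}

On CPUs without the tuning flag (the FAST-DIVQ prefixes in the test) no guard is emitted and a plain divq/idivq is used unconditionally.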