From 3aa41e016a7efc2cca1926700cfbd8ddc05e4274 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 10 Jun 2024 16:23:13 -0700 Subject: [PATCH 01/82] [X86,MC] Remove two getPrevNode Make it more feasible to replace the fragment representation, which might yield a large peak RSS win. --- .../lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 472f34a4efdb47..30f22cd322fecf 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -406,16 +406,10 @@ static bool isRightAfterData(MCFragment *CurrentFragment, const std::pair &PrevInstPosition) { MCFragment *F = CurrentFragment; - // Empty data fragments may be created to prevent further data being - // added into the previous fragment, we need to skip them since they - // have no contents. - for (; isa_and_nonnull(F); F = F->getPrevNode()) - if (cast(F)->getContents().size() != 0) - break; - // Since data is always emitted into a DataFragment, our check strategy is // simple here. // - If the fragment is a DataFragment + // - If it's empty (section start or data after align), return false. // - If it's not the fragment where the previous instruction is, // returns true. // - If it's the fragment holding the previous instruction but its @@ -424,8 +418,9 @@ isRightAfterData(MCFragment *CurrentFragment, // - Otherwise returns false. // - If the fragment is not a DataFragment, returns false. 
if (auto *DF = dyn_cast_or_null(F)) - return DF != PrevInstPosition.first || - DF->getContents().size() != PrevInstPosition.second; + return DF->getContents().size() && + (DF != PrevInstPosition.first || + DF->getContents().size() != PrevInstPosition.second); return false; } @@ -526,7 +521,7 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, if (!CanPadInst) return; - if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) { + if (PendingBA && PendingBA->getNextNode() == OS.getCurrentFragment()) { // Macro fusion actually happens and there is no other fragment inserted // after the previous instruction. // From e2d539bbbab229c496f4d990993275819330b2a3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 10 Jun 2024 16:38:24 -0700 Subject: [PATCH 02/82] [memprof] Fix comment typos (NFC) --- llvm/include/llvm/ProfileData/MemProf.h | 2 +- llvm/lib/ProfileData/MemProf.cpp | 2 +- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 4 ++-- llvm/lib/Transforms/Instrumentation/MemProfiler.cpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 53ddfd19234108..a9f7d7c3fda885 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -988,7 +988,7 @@ computeFrameHistogram(llvm::MapVector> // // The radix tree allows us to reconstruct call stacks in the leaf-to-root // order as we scan the array from left ro right while following pointers to -// parents along the way +// parents along the way. // // For example, if we are decoding CallStackId 2, we start a forward traversal // at Index 7, noting the call stack length of 4 and obtaining f5 and f4. 
When diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 4ca868722ec4e3..6d784053f877d4 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -394,7 +394,7 @@ Expected readMemProfSchema(const unsigned char *&Buffer) { } Result.push_back(static_cast(Tag)); } - // Advace the buffer to one past the schema if we succeeded. + // Advance the buffer to one past the schema if we succeeded. Buffer = Ptr; return Result; } diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index b58b906465e569..1cbf9c957f5c2b 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -568,13 +568,13 @@ class CallsiteContextGraph { /// unioning their recorded alloc types. uint8_t computeAllocType(DenseSet &ContextIds); - /// Returns the alloction type of the intersection of the contexts of two + /// Returns the allocation type of the intersection of the contexts of two /// nodes (based on their provided context id sets), optimized for the case /// when Node1Ids is smaller than Node2Ids. uint8_t intersectAllocTypesImpl(const DenseSet &Node1Ids, const DenseSet &Node2Ids); - /// Returns the alloction type of the intersection of the contexts of two + /// Returns the allocation type of the intersection of the contexts of two /// nodes (based on their provided context id sets). 
uint8_t intersectAllocTypes(const DenseSet &Node1Ids, const DenseSet &Node2Ids); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index aac57231ba2ed6..8a12fa19a3dedf 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -839,7 +839,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, }; // Now walk the instructions, looking up the associated profile data using - // dbug locations. + // debug locations. for (auto &BB : F) { for (auto &I : BB) { if (I.isDebugOrPseudoInst()) @@ -937,7 +937,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, // Add callsite metadata for the instruction's location list so that // it simpler later on to identify which part of the MIB contexts // are from this particular instruction (including during inlining, - // when the callsite metdata will be updated appropriately). + // when the callsite metadata will be updated appropriately). // FIXME: can this be changed to strip out the matching stack // context ids from the MIB contexts and not add any callsite // metadata here to save space? From 48aebd4cf88b3632e8c3ed6b976287c973628e14 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Tue, 11 Jun 2024 00:51:52 +0100 Subject: [PATCH 03/82] Revert "[AArch64] Decouple feature dependency expansion. (#94279)" (#95056) This reverts commit 2cf14398c9341feddb419e7ff9c8c5623a3da3db since it broke the llvm test suite: SingleSource/UnitTests/AArch64/acle-fmv-features.c:59:9: error: instruction requires: altnzcv SingleSource/UnitTests/AArch64/acle-fmv-features.c:117:10: error: instruction requires: aes ... Looks like the FMV dependencies were used in the target attribute and now features that are FMVOnly (have AEK_NONE) cannot be expanded in parseTargetAttr using the ExtensionSet. 
This suggests that either the tests are wrong (they are using an FMVOnly feature in a target attribute), or that we need to turn the FMVOnly features into Extensions (these two are tablegen classes). --- clang/include/clang/AST/ASTContext.h | 3 + clang/lib/AST/ASTContext.cpp | 59 +++++----- clang/lib/AST/CMakeLists.txt | 2 - clang/lib/Basic/Targets/AArch64.cpp | 105 +++++++++++------ clang/lib/Basic/Targets/AArch64.h | 4 + .../CodeGen/aarch64-cpu-supports-target.c | 4 +- .../aarch64-sme-attrs.cpp | 2 +- clang/test/CodeGen/aarch64-targetattr.c | 48 ++++---- clang/test/CodeGen/attr-target-version.c | 46 ++++---- clang/test/Sema/aarch64-neon-target.c | 4 +- .../llvm/TargetParser/AArch64TargetParser.h | 107 +++++++----------- llvm/lib/TargetParser/AArch64TargetParser.cpp | 51 +++------ 12 files changed, 222 insertions(+), 213 deletions(-) diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 8bce4812f0d482..a1d1d1c51cd417 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -3203,6 +3203,9 @@ class ASTContext : public RefCountedBase { /// valid feature names. 
ParsedTargetAttr filterFunctionTargetAttrs(const TargetAttr *TD) const; + std::vector + filterFunctionTargetVersionAttrs(const TargetVersionAttr *TV) const; + void getFunctionFeatureMap(llvm::StringMap &FeatureMap, const FunctionDecl *) const; void getFunctionFeatureMap(llvm::StringMap &FeatureMap, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index cd76b8aa271dab..bf74e56a14799c 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -87,7 +87,6 @@ #include "llvm/Support/MD5.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/Triple.h" #include #include @@ -13664,20 +13663,17 @@ QualType ASTContext::getCorrespondingSignedFixedPointType(QualType Ty) const { } } -// Given a list of FMV features, return a concatenated list of the -// corresponding backend features (which may contain duplicates). -static std::vector getFMVBackendFeaturesFor( - const llvm::SmallVectorImpl &FMVFeatStrings) { - std::vector BackendFeats; - for (StringRef F : FMVFeatStrings) { - if (auto FMVExt = llvm::AArch64::parseArchExtension(F)) { - SmallVector Feats; - FMVExt->DependentFeatures.split(Feats, ',', -1, false); - for (StringRef F : Feats) - BackendFeats.push_back(F.str()); - } - } - return BackendFeats; +std::vector ASTContext::filterFunctionTargetVersionAttrs( + const TargetVersionAttr *TV) const { + assert(TV != nullptr); + llvm::SmallVector Feats; + std::vector ResFeats; + TV->getFeatures(Feats); + for (auto &Feature : Feats) + if (Target->validateCpuSupports(Feature.str())) + // Use '?' to mark features that came from TargetVersion. + ResFeats.push_back("?" 
+ Feature.str()); + return ResFeats; } ParsedTargetAttr @@ -13712,12 +13708,10 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap &FeatureMap, // Make a copy of the features as passed on the command line into the // beginning of the additional features from the function to override. - // AArch64 handles command line option features in parseTargetAttr(). - if (!Target->getTriple().isAArch64()) - ParsedAttr.Features.insert( - ParsedAttr.Features.begin(), - Target->getTargetOpts().FeaturesAsWritten.begin(), - Target->getTargetOpts().FeaturesAsWritten.end()); + ParsedAttr.Features.insert( + ParsedAttr.Features.begin(), + Target->getTargetOpts().FeaturesAsWritten.begin(), + Target->getTargetOpts().FeaturesAsWritten.end()); if (ParsedAttr.CPU != "" && Target->isValidCPUName(ParsedAttr.CPU)) TargetCPU = ParsedAttr.CPU; @@ -13738,31 +13732,32 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap &FeatureMap, Target->getTargetOpts().FeaturesAsWritten.end()); Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features); } else if (const auto *TC = FD->getAttr()) { + std::vector Features; if (Target->getTriple().isAArch64()) { + // TargetClones for AArch64 llvm::SmallVector Feats; TC->getFeatures(Feats, GD.getMultiVersionIndex()); - std::vector Features = getFMVBackendFeaturesFor(Feats); + for (StringRef Feat : Feats) + if (Target->validateCpuSupports(Feat.str())) + // Use '?' to mark features that came from AArch64 TargetClones. + Features.push_back("?" 
+ Feat.str()); Features.insert(Features.begin(), Target->getTargetOpts().FeaturesAsWritten.begin(), Target->getTargetOpts().FeaturesAsWritten.end()); - Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features); } else { - std::vector Features; StringRef VersionStr = TC->getFeatureStr(GD.getMultiVersionIndex()); if (VersionStr.starts_with("arch=")) TargetCPU = VersionStr.drop_front(sizeof("arch=") - 1); else if (VersionStr != "default") Features.push_back((StringRef{"+"} + VersionStr).str()); - Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features); } - } else if (const auto *TV = FD->getAttr()) { - llvm::SmallVector Feats; - TV->getFeatures(Feats); - std::vector Features = getFMVBackendFeaturesFor(Feats); - Features.insert(Features.begin(), - Target->getTargetOpts().FeaturesAsWritten.begin(), - Target->getTargetOpts().FeaturesAsWritten.end()); Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features); + } else if (const auto *TV = FD->getAttr()) { + std::vector Feats = filterFunctionTargetVersionAttrs(TV); + Feats.insert(Feats.begin(), + Target->getTargetOpts().FeaturesAsWritten.begin(), + Target->getTargetOpts().FeaturesAsWritten.end()); + Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Feats); } else { FeatureMap = Target->getTargetOpts().FeatureMap; } diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index 0328666d59b1fc..a5d3dacfc1a84e 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -139,6 +139,4 @@ add_clang_library(clangAST omp_gen ClangDriverOptions intrinsics_gen - # These generated headers are included transitively. 
- AArch64TargetParserTableGen ) diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 6fba5fff7bcc19..08d13c41a48572 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1052,18 +1052,57 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, return true; } +bool AArch64TargetInfo::initFeatureMap( + llvm::StringMap &Features, DiagnosticsEngine &Diags, StringRef CPU, + const std::vector &FeaturesVec) const { + std::vector UpdatedFeaturesVec; + // Parse the CPU and add any implied features. + std::optional CpuInfo = llvm::AArch64::parseCpu(CPU); + if (CpuInfo) { + auto Exts = CpuInfo->getImpliedExtensions(); + std::vector CPUFeats; + llvm::AArch64::getExtensionFeatures(Exts, CPUFeats); + for (auto F : CPUFeats) { + assert((F[0] == '+' || F[0] == '-') && "Expected +/- in target feature!"); + UpdatedFeaturesVec.push_back(F.str()); + } + } + + // Process target and dependent features. This is done in two loops collecting + // them into UpdatedFeaturesVec: first to add dependent '+'features, second to + // add target '+/-'features that can later disable some of features added on + // the first loop. Function Multi Versioning features begin with '?'. + for (const auto &Feature : FeaturesVec) + if (((Feature[0] == '?' 
|| Feature[0] == '+')) && + AArch64TargetInfo::doesFeatureAffectCodeGen(Feature.substr(1))) { + StringRef DepFeatures = + AArch64TargetInfo::getFeatureDependencies(Feature.substr(1)); + SmallVector AttrFeatures; + DepFeatures.split(AttrFeatures, ","); + for (auto F : AttrFeatures) + UpdatedFeaturesVec.push_back(F.str()); + } + for (const auto &Feature : FeaturesVec) + if (Feature[0] != '?') { + std::string UpdatedFeature = Feature; + if (Feature[0] == '+') { + std::optional Extension = + llvm::AArch64::parseArchExtension(Feature.substr(1)); + if (Extension) + UpdatedFeature = Extension->Feature.str(); + } + UpdatedFeaturesVec.push_back(UpdatedFeature); + } + + return TargetInfo::initFeatureMap(Features, Diags, CPU, UpdatedFeaturesVec); +} + // Parse AArch64 Target attributes, which are a comma separated list of: // "arch=" - parsed to features as per -march=.. // "cpu=" - parsed to features as per -mcpu=.., with CPU set to // "tune=" - TuneCPU set to // "feature", "no-feature" - Add (or remove) feature. // "+feature", "+nofeature" - Add (or remove) feature. -// -// A feature may correspond to an Extension (anything with a corresponding -// AEK_), in which case an ExtensionSet is used to parse it and expand its -// dependencies. Otherwise the feature is passed through (e.g. +v8.1a, -// +outline-atomics, -fmv, etc). Features coming from the command line are -// already parsed, therefore their dependencies do not need expansion. 
ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const { ParsedTargetAttr Ret; if (Features == "default") @@ -1073,26 +1112,23 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const { bool FoundArch = false; auto SplitAndAddFeatures = [](StringRef FeatString, - std::vector &Features, - llvm::AArch64::ExtensionSet &FeatureBits) { + std::vector &Features) { SmallVector SplitFeatures; FeatString.split(SplitFeatures, StringRef("+"), -1, false); for (StringRef Feature : SplitFeatures) { - if (FeatureBits.parseModifier(Feature, /* AllowNoDashForm = */ true)) - continue; - // Pass through features that are not extensions, e.g. +v8.1a, - // +outline-atomics, -fmv, etc. - if (Feature.starts_with("no")) - Features.push_back("-" + Feature.drop_front(2).str()); + StringRef FeatureName = llvm::AArch64::getArchExtFeature(Feature); + if (!FeatureName.empty()) + Features.push_back(FeatureName.str()); else - Features.push_back("+" + Feature.str()); + // Pushing the original feature string to give a sema error later on + // when they get checked. + if (Feature.starts_with("no")) + Features.push_back("-" + Feature.drop_front(2).str()); + else + Features.push_back("+" + Feature.str()); } }; - llvm::AArch64::ExtensionSet FeatureBits; - // Reconstruct the bitset from the command line option features. - FeatureBits.reconstructFromParsedFeatures(getTargetOpts().FeaturesAsWritten); - for (auto &Feature : AttrFeatures) { Feature = Feature.trim(); if (Feature.starts_with("fpmath=")) @@ -1115,9 +1151,9 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const { // Ret.Features. 
if (!AI) continue; - FeatureBits.addArchDefaults(*AI); + Ret.Features.push_back(AI->ArchFeature.str()); // Add any extra features, after the + - SplitAndAddFeatures(Split.second, Ret.Features, FeatureBits); + SplitAndAddFeatures(Split.second, Ret.Features); } else if (Feature.starts_with("cpu=")) { if (!Ret.CPU.empty()) Ret.Duplicate = "cpu="; @@ -1127,10 +1163,7 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const { std::pair Split = Feature.split("=").second.trim().split("+"); Ret.CPU = Split.first; - if (auto CpuInfo = llvm::AArch64::parseCpu(Ret.CPU)) { - FeatureBits.addCPUDefaults(*CpuInfo); - SplitAndAddFeatures(Split.second, Ret.Features, FeatureBits); - } + SplitAndAddFeatures(Split.second, Ret.Features); } } else if (Feature.starts_with("tune=")) { if (!Ret.Tune.empty()) @@ -1138,19 +1171,25 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const { else Ret.Tune = Feature.split("=").second.trim(); } else if (Feature.starts_with("+")) { - SplitAndAddFeatures(Feature, Ret.Features, FeatureBits); + SplitAndAddFeatures(Feature, Ret.Features); + } else if (Feature.starts_with("no-")) { + StringRef FeatureName = + llvm::AArch64::getArchExtFeature(Feature.split("-").second); + if (!FeatureName.empty()) + Ret.Features.push_back("-" + FeatureName.drop_front(1).str()); + else + Ret.Features.push_back("-" + Feature.split("-").second.str()); } else { - if (FeatureBits.parseModifier(Feature, /* AllowNoDashForm = */ true)) - continue; - // Pass through features that are not extensions, e.g. +v8.1a, - // +outline-atomics, -fmv, etc. - if (Feature.starts_with("no-")) - Ret.Features.push_back("-" + Feature.drop_front(3).str()); + // Try parsing the string to the internal target feature name. If it is + // invalid, add the original string (which could already be an internal + // name). These should be checked later by isValidFeatureName. 
+ StringRef FeatureName = llvm::AArch64::getArchExtFeature(Feature); + if (!FeatureName.empty()) + Ret.Features.push_back(FeatureName.str()); else Ret.Features.push_back("+" + Feature.str()); } } - FeatureBits.toLLVMFeatureList(Ret.Features); return Ret; } diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 696553ef8038a8..12fb50286f7511 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -107,6 +107,10 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { unsigned multiVersionSortPriority(StringRef Name) const override; unsigned multiVersionFeatureCost() const override; + bool + initFeatureMap(llvm::StringMap &Features, DiagnosticsEngine &Diags, + StringRef CPU, + const std::vector &FeaturesVec) const override; bool useFP16ConversionIntrinsics() const override { return false; } diff --git a/clang/test/CodeGen/aarch64-cpu-supports-target.c b/clang/test/CodeGen/aarch64-cpu-supports-target.c index 28187bcf745331..e023944b24e53a 100644 --- a/clang/test/CodeGen/aarch64-cpu-supports-target.c +++ b/clang/test/CodeGen/aarch64-cpu-supports-target.c @@ -48,5 +48,5 @@ int test_versions() { return code(); } // CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } -// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } +// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon" } +// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } diff --git 
a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp index 9885ac45e6a0e0..af8933d93d6cbb 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 \ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \ // RUN: -disable-O0-optnone -Werror -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg \ // RUN: | opt -S -passes=inline \ diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c index 644e6a692c3be9..3e7a2092456071 100644 --- a/clang/test/CodeGen/aarch64-targetattr.c +++ b/clang/test/CodeGen/aarch64-targetattr.c @@ -58,50 +58,58 @@ void v1msve() {} // CHECK-LABEL: @plussve() #12 __attribute__((target("+sve"))) void plussve() {} -// CHECK-LABEL: @plussveplussve2() #12 +// CHECK-LABEL: @plussveplussve2() #13 __attribute__((target("+sve+nosve2"))) void plussveplussve2() {} -// CHECK-LABEL: @plussveminusnosve2() #12 +// CHECK-LABEL: @plussveminusnosve2() #13 __attribute__((target("sve,no-sve2"))) void plussveminusnosve2() {} -// CHECK-LABEL: @plusfp16() #13 +// CHECK-LABEL: @plusfp16() #14 __attribute__((target("+fp16"))) void plusfp16() {} -// CHECK-LABEL: @all() #14 +// CHECK-LABEL: @all() #15 __attribute__((target("cpu=neoverse-n1,tune=cortex-a710,arch=armv8.6-a+sve2"))) void all() {} -// CHECK-LABEL: @allplusbranchprotection() #15 +// CHECK-LABEL: @allplusbranchprotection() #16 __attribute__((target("cpu=neoverse-n1,tune=cortex-a710,arch=armv8.6-a+sve2,branch-protection=standard"))) void allplusbranchprotection() {} -// CHECK-LABEL: @plusnosimd() #16 +// These tests check that the user facing and internal llvm name are both accepted. 
+// CHECK-LABEL: @plusnoneon() #17 +__attribute__((target("+noneon"))) +void plusnoneon() {} +// CHECK-LABEL: @plusnosimd() #17 __attribute__((target("+nosimd"))) void plusnosimd() {} -// CHECK-LABEL: @nosimd() #16 +// CHECK-LABEL: @noneon() #17 +__attribute__((target("no-neon"))) +void noneon() {} +// CHECK-LABEL: @nosimd() #17 __attribute__((target("no-simd"))) void nosimd() {} // This isn't part of the standard interface, but test that -arch features should not apply anything else. -// CHECK-LABEL: @minusarch() #17 +// CHECK-LABEL: @minusarch() #18 __attribute__((target("no-v9.3a"))) void minusarch() {} // CHECK: attributes #0 = { {{.*}} "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #1 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } // CHECK: attributes #2 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } -// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } -// CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" } +// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } +// CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm" } // CHECK: attributes #5 = { {{.*}} "tune-cpu"="cortex-a710" } 
// CHECK: attributes #6 = { {{.*}} "target-cpu"="generic" } // CHECK: attributes #7 = { {{.*}} "tune-cpu"="generic" } -// CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.1a,+v8.2a,+v8a" "tune-cpu"="cortex-a710" } -// CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+sve" "tune-cpu"="cortex-a710" } -// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" } -// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,-sve" } -// CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+sve" } -// CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16" } -// CHECK: attributes #14 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } -// CHECK: attributes #15 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } -// CHECK-NOT: attributes #16 = {{.*}} "target-features" -// CHECK: attributes #17 = { {{.*}} "target-features"="-v9.3a" } +// CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" 
"target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs" "tune-cpu"="cortex-a710" } +// CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" "tune-cpu"="cortex-a710" } +// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2" } +// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,-sve" } +// CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } +// CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-sve2" } +// CHECK: attributes #14 = { {{.*}} "target-features"="+fullfp16" } +// CHECK: attributes #15 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #16 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" } +// CHECK: attributes #17 = { {{.*}} "target-features"="-neon" } +// CHECK: attributes #18 = { {{.*}} "target-features"="-v9.3a" } diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 75f8734e5aaf37..3597711333d341 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -1129,42 +1129,42 @@ 
int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NOFMV-NEXT: ret i32 0 // //. -// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+neon,+rand,-v9.5a" } +// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp16fml,+fullfp16,+neon,+rand,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+sme,+sme-i16i64,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+sha2,-v9.5a" } -// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+ls64,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+sha2,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+ls64,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp16fml,+fullfp16,+neon,-fp-armv8,-v9.5a" } +// CHECK: 
attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR9:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } +// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" } -// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" } -// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme,-v9.5a" } -// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } +// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fullfp16,+neon,+rdm,+sme,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+jsconv,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+jsconv,+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-v9.5a" } -// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-v9.5a" } -// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-v9.5a" } +// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4,-v9.5a" } -// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+rdm,-v9.5a" } +// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+sm4,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+rdm,-fp-armv8,-v9.5a" } //. 
// CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } diff --git a/clang/test/Sema/aarch64-neon-target.c b/clang/test/Sema/aarch64-neon-target.c index 642afddd88c154..fa45fff1d183d6 100644 --- a/clang/test/Sema/aarch64-neon-target.c +++ b/clang/test/Sema/aarch64-neon-target.c @@ -69,8 +69,8 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t vrnd_f16(v4f16); // expected-error {{always_inline function 'vrnd_f16' requires target feature 'fullfp16'}} vmaxnm_f16(v4f16, v4f16); // expected-error {{always_inline function 'vmaxnm_f16' requires target feature 'fullfp16'}} vrndi_f16(v4f16); // expected-error {{always_inline function 'vrndi_f16' requires target feature 'fullfp16'}} - // fp16fml depends on fp-armv8 - vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'fp-armv8'}} + // fp16fml + vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'fp16fml'}} // i8mm vmmlaq_s32(v4i32, v8i16, v8i16); // expected-error {{always_inline function 'vmmlaq_s32' requires target feature 'i8mm'}} vusdot_laneq_s32(v2i32, v8i8, v8i16, 0); // expected-error {{always_inline function 'vusdot_s32' requires target feature 'i8mm'}} diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index dcb00f6b0249fb..b7c1216d16e5ea 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -132,6 +132,48 @@ struct ExtensionInfo { #define EMIT_EXTENSIONS #include "llvm/TargetParser/AArch64TargetParserDef.inc" +struct ExtensionSet { + // Set of extensions which are currently 
enabled. + ExtensionBitset Enabled; + // Set of extensions which have been enabled or disabled at any point. Used + // to avoid cluttering the cc1 command-line with lots of unneeded features. + ExtensionBitset Touched; + // Base architecture version, which we need to know because some feature + // dependencies change depending on this. + const ArchInfo *BaseArch; + + ExtensionSet() : Enabled(), Touched(), BaseArch(nullptr) {} + + // Enable the given architecture extension, and any other extensions it + // depends on. Does not change the base architecture, or follow dependencies + // between features which are only related by required architecture versions. + void enable(ArchExtKind E); + + // Disable the given architecture extension, and any other extensions which + // depend on it. Does not change the base architecture, or follow + // dependencies between features which are only related by required + // architecture versions. + void disable(ArchExtKind E); + + // Add default extensions for the given CPU. Records the base architecture, + // to later resolve dependencies which depend on it. + void addCPUDefaults(const CpuInfo &CPU); + + // Add default extensions for the given architecture version. Records the + // base architecture, to later resolve dependencies which depend on it. + void addArchDefaults(const ArchInfo &Arch); + + // Add or remove a feature based on a modifier string. The string must be of + // the form "<name>" to enable a feature or "no<name>" to disable it. This + // will also enable or disable any features as required by the dependencies + // between them. + bool parseModifier(StringRef Modifier); + + // Convert the set of enabled extension to an LLVM feature list, appending + // them to Features. + void toLLVMFeatureList(std::vector &Features) const; +}; + // Represents a dependency between two architecture extensions. Later is the // feature which was added to the architecture after Earlier, and expands the // functionality provided by it. 
If Later is enabled, then Earlier will also be @@ -542,65 +584,6 @@ inline constexpr CpuInfo CpuInfos[] = { AArch64::AEK_PROFILE}))}, }; -struct ExtensionSet { - // Set of extensions which are currently enabled. - ExtensionBitset Enabled; - // Set of extensions which have been enabled or disabled at any point. Used - // to avoid cluttering the cc1 command-line with lots of unneeded features. - ExtensionBitset Touched; - // Base architecture version, which we need to know because some feature - // dependencies change depending on this. - const ArchInfo *BaseArch; - - ExtensionSet() : Enabled(), Touched(), BaseArch(nullptr) {} - - // Enable the given architecture extension, and any other extensions it - // depends on. Does not change the base architecture, or follow dependencies - // between features which are only related by required arcitecture versions. - void enable(ArchExtKind E); - - // Disable the given architecture extension, and any other extensions which - // depend on it. Does not change the base architecture, or follow - // dependencies between features which are only related by required - // arcitecture versions. - void disable(ArchExtKind E); - - // Add default extensions for the given CPU. Records the base architecture, - // to later resolve dependencies which depend on it. - void addCPUDefaults(const CpuInfo &CPU); - - // Add default extensions for the given architecture version. Records the - // base architecture, to later resolve dependencies which depend on it. - void addArchDefaults(const ArchInfo &Arch); - - // Add or remove a feature based on a modifier string. The string must be of - // the form "" to enable a feature or "no" to disable it. This - // will also enable or disable any features as required by the dependencies - // between them. 
- bool parseModifier(StringRef Modifier, const bool AllowNoDashForm = false); - - // Constructs a new ExtensionSet by toggling the corresponding bits for every - // feature in the \p Features list without expanding their dependencies. Used - // for reconstructing an ExtensionSet from the output of toLLVMFeatures(). - void reconstructFromParsedFeatures(const std::vector &Features); - - // Convert the set of enabled extension to an LLVM feature list, appending - // them to Features. - template void toLLVMFeatureList(std::vector &Features) const { - if (BaseArch && !BaseArch->ArchFeature.empty()) - Features.emplace_back(T(BaseArch->ArchFeature)); - - for (const auto &E : Extensions) { - if (E.Feature.empty() || !Touched.test(E.ID)) - continue; - if (Enabled.test(E.ID)) - Features.emplace_back(T(E.Feature)); - else - Features.emplace_back(T(E.NegFeature)); - } - } -}; - // Name alias. struct Alias { StringRef AltName; @@ -624,13 +607,7 @@ const ArchInfo *getArchForCpu(StringRef CPU); // Parser const ArchInfo *parseArch(StringRef Arch); - -// Return the extension which has the given -target-feature name. -std::optional targetFeatureToExtension(StringRef TargetFeature); - -// Parse a name as defined by the Extension class in tablegen. std::optional parseArchExtension(StringRef Extension); - // Given the name of a CPU or alias, return the correponding CpuInfo. 
std::optional parseCpu(StringRef Name); // Used by target parser tests diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index d1cc306790522d..ca356ec82bf1f9 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -122,14 +122,6 @@ AArch64::parseArchExtension(StringRef ArchExt) { return {}; } -std::optional -AArch64::targetFeatureToExtension(StringRef TargetFeature) { - for (const auto &E : Extensions) - if (TargetFeature == E.Feature) - return E; - return {}; -} - std::optional AArch64::parseCpu(StringRef Name) { // Resolve aliases first. Name = resolveCPUAlias(Name); @@ -221,6 +213,21 @@ void AArch64::ExtensionSet::disable(ArchExtKind E) { disable(Dep.Later); } +void AArch64::ExtensionSet::toLLVMFeatureList( + std::vector &Features) const { + if (BaseArch && !BaseArch->ArchFeature.empty()) + Features.push_back(BaseArch->ArchFeature); + + for (const auto &E : Extensions) { + if (E.Feature.empty() || !Touched.test(E.ID)) + continue; + if (Enabled.test(E.ID)) + Features.push_back(E.Feature); + else + Features.push_back(E.NegFeature); + } +} + void AArch64::ExtensionSet::addCPUDefaults(const CpuInfo &CPU) { LLVM_DEBUG(llvm::dbgs() << "addCPUDefaults(" << CPU.Name << ")\n"); BaseArch = &CPU.Arch; @@ -240,18 +247,11 @@ void AArch64::ExtensionSet::addArchDefaults(const ArchInfo &Arch) { enable(E.ID); } -bool AArch64::ExtensionSet::parseModifier(StringRef Modifier, - const bool AllowNoDashForm) { +bool AArch64::ExtensionSet::parseModifier(StringRef Modifier) { LLVM_DEBUG(llvm::dbgs() << "parseModifier(" << Modifier << ")\n"); - size_t NChars = 0; - // The "no-feat" form is allowed in the target attribute but nowhere else. 
- if (AllowNoDashForm && Modifier.starts_with("no-")) - NChars = 3; - else if (Modifier.starts_with("no")) - NChars = 2; - bool IsNegated = NChars != 0; - StringRef ArchExt = Modifier.drop_front(NChars); + bool IsNegated = Modifier.starts_with("no"); + StringRef ArchExt = IsNegated ? Modifier.drop_front(2) : Modifier; if (auto AE = parseArchExtension(ArchExt)) { if (AE->Feature.empty() || AE->NegFeature.empty()) @@ -265,21 +265,6 @@ bool AArch64::ExtensionSet::parseModifier(StringRef Modifier, return false; } -void AArch64::ExtensionSet::reconstructFromParsedFeatures( - const std::vector &Features) { - assert(Touched.none() && "Bitset already initialized"); - for (auto &F : Features) { - bool IsNegated = F[0] == '-'; - if (auto AE = targetFeatureToExtension(F)) { - Touched.set(AE->ID); - if (IsNegated) - Enabled.reset(AE->ID); - else - Enabled.set(AE->ID); - } - } -} - const AArch64::ExtensionInfo & AArch64::getExtensionByID(AArch64::ArchExtKind ExtID) { return lookupExtensionByID(ExtID); From 46e41c8631bd6c1a6c91d6cc4a5e4f1671078ccd Mon Sep 17 00:00:00 2001 From: Will Dietz Date: Mon, 10 Jun 2024 19:12:34 -0500 Subject: [PATCH 04/82] [mlir] Sanitize identifiers with leading symbol. (#94795) Presently, if name starts with a symbol it's converted to hex which may cause the result to be invalid by starting with a digit. Address this and add a small test. 
Co-authored-by: Will Dietz --- mlir/lib/IR/AsmPrinter.cpp | 10 +++++++--- mlir/test/IR/print-attr-type-aliases.mlir | 3 +++ mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp | 1 + 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 6a362afc52f259..2c43a6f15aa83f 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -999,9 +999,13 @@ static StringRef sanitizeIdentifier(StringRef name, SmallString<16> &buffer, bool allowTrailingDigit = true) { assert(!name.empty() && "Shouldn't have an empty name here"); + auto validChar = [&](char ch) { + return llvm::isAlnum(ch) || allowedPunctChars.contains(ch); + }; + auto copyNameToBuffer = [&] { for (char ch : name) { - if (llvm::isAlnum(ch) || allowedPunctChars.contains(ch)) + if (validChar(ch)) buffer.push_back(ch); else if (ch == ' ') buffer.push_back('_'); @@ -1013,7 +1017,7 @@ static StringRef sanitizeIdentifier(StringRef name, SmallString<16> &buffer, // Check to see if this name is valid. If it starts with a digit, then it // could conflict with the autogenerated numeric ID's, so add an underscore // prefix to avoid problems. - if (isdigit(name[0])) { + if (isdigit(name[0]) || (!validChar(name[0]) && name[0] != ' ')) { buffer.push_back('_'); copyNameToBuffer(); return buffer; @@ -1029,7 +1033,7 @@ static StringRef sanitizeIdentifier(StringRef name, SmallString<16> &buffer, // Check to see that the name consists of only valid identifier characters. 
for (char ch : name) { - if (!llvm::isAlnum(ch) && !allowedPunctChars.contains(ch)) { + if (!validChar(ch)) { copyNameToBuffer(); return buffer; } diff --git a/mlir/test/IR/print-attr-type-aliases.mlir b/mlir/test/IR/print-attr-type-aliases.mlir index 162eacd0022832..27c5a75addbb59 100644 --- a/mlir/test/IR/print-attr-type-aliases.mlir +++ b/mlir/test/IR/print-attr-type-aliases.mlir @@ -11,6 +11,9 @@ // CHECK-DAG: #_0_test_alias = "alias_test:prefixed_digit" "test.op"() {alias_test = "alias_test:prefixed_digit"} : () -> () +// CHECK-DAG: #_25test = "alias_test:prefixed_symbol" +"test.op"() {alias_test = "alias_test:prefixed_symbol"} : () -> () + // CHECK-DAG: #test_alias_conflict0_ = "alias_test:sanitize_conflict_a" // CHECK-DAG: #test_alias_conflict0_1 = "alias_test:sanitize_conflict_b" "test.op"() {alias_test = ["alias_test:sanitize_conflict_a", "alias_test:sanitize_conflict_b"]} : () -> () diff --git a/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp b/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp index a3a8913d5964c6..64add8cef36986 100644 --- a/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp @@ -188,6 +188,7 @@ struct TestOpAsmInterface : public OpAsmDialectInterface { .Case("alias_test:dot_in_name", StringRef("test.alias")) .Case("alias_test:trailing_digit", StringRef("test_alias0")) .Case("alias_test:prefixed_digit", StringRef("0_test_alias")) + .Case("alias_test:prefixed_symbol", StringRef("%test")) .Case("alias_test:sanitize_conflict_a", StringRef("test_alias_conflict0")) .Case("alias_test:sanitize_conflict_b", From 1737814e577f80917fc9a56d58f09a22665179d1 Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Mon, 10 Jun 2024 17:22:58 -0700 Subject: [PATCH 05/82] [libc][stdlib] Add freelist class (#95041) This implements a traditional freelist to be used by the freelist allocator. It operates on spans of bytes which can be anything. 
The freelist allocator will store Blocks inside them. This is a part of #94270 to land in smaller patches. --- libc/src/stdlib/CMakeLists.txt | 10 ++ libc/src/stdlib/freelist.h | 198 +++++++++++++++++++++++++ libc/test/src/stdlib/CMakeLists.txt | 12 ++ libc/test/src/stdlib/freelist_test.cpp | 166 +++++++++++++++++++++ 4 files changed, 386 insertions(+) create mode 100644 libc/src/stdlib/freelist.h create mode 100644 libc/test/src/stdlib/freelist_test.cpp diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index afb2d6d91cba43..d4aa50a43d186d 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -392,6 +392,16 @@ else() libc.src.__support.CPP.span libc.src.__support.CPP.type_traits ) + add_header_library( + freelist + HDRS + freelist.h + DEPENDS + libc.src.__support.fixedvector + libc.src.__support.CPP.cstddef + libc.src.__support.CPP.array + libc.src.__support.CPP.span + ) add_entrypoint_external( malloc ) diff --git a/libc/src/stdlib/freelist.h b/libc/src/stdlib/freelist.h new file mode 100644 index 00000000000000..20b4977835bef8 --- /dev/null +++ b/libc/src/stdlib/freelist.h @@ -0,0 +1,198 @@ +//===-- Interface for freelist_malloc -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_FREELIST_H +#define LLVM_LIBC_SRC_STDLIB_FREELIST_H + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/cstddef.h" +#include "src/__support/CPP/span.h" +#include "src/__support/fixedvector.h" + +namespace LIBC_NAMESPACE { + +using cpp::span; + +/// Basic [freelist](https://en.wikipedia.org/wiki/Free_list) implementation +/// for an allocator. 
This implementation buckets by chunk size, with a list +/// of user-provided buckets. Each bucket is a linked list of storage chunks. +/// Because this freelist uses the added chunks themselves as list nodes, there +/// is a lower bound of `sizeof(FreeList.FreeListNode)` bytes for chunks which +/// can be added to this freelist. There is also an implicit bucket for +/// "everything else", for chunks which do not fit into a bucket. +/// +/// Each added chunk will be added to the smallest bucket under which it fits. +/// If it does not fit into any user-provided bucket, it will be added to the +/// default bucket. +/// +/// As an example, assume that the `FreeList` is configured with buckets of +/// sizes {64, 128, 256, and 512} bytes. The internal state may look like the +/// following: +/// +/// @code{.unparsed} +/// bucket[0] (64B) --> chunk[12B] --> chunk[42B] --> chunk[64B] --> NULL +/// bucket[1] (128B) --> chunk[65B] --> chunk[72B] --> NULL +/// bucket[2] (256B) --> NULL +/// bucket[3] (512B) --> chunk[312B] --> chunk[512B] --> chunk[416B] --> NULL +/// bucket[4] (implicit) --> chunk[1024B] --> chunk[513B] --> NULL +/// @endcode +/// +/// Note that added chunks should be aligned to a 4-byte boundary. +template class FreeList { +public: + // Remove copy/move ctors + FreeList(const FreeList &other) = delete; + FreeList(FreeList &&other) = delete; + FreeList &operator=(const FreeList &other) = delete; + FreeList &operator=(FreeList &&other) = delete; + + /// Adds a chunk to this freelist. + bool add_chunk(cpp::span chunk); + + /// Finds an eligible chunk for an allocation of size `size`. + /// + /// @note This returns the first allocation possible within a given bucket; + /// It does not currently optimize for finding the smallest chunk. + /// + /// @returns + /// * On success - A span representing the chunk. + /// * On failure (e.g. there were no chunks available for that allocation) - + /// A span with a size of 0. 
+ cpp::span find_chunk(size_t size) const; + + /// Removes a chunk from this freelist. + bool remove_chunk(cpp::span chunk); + +private: + // For a given size, find which index into chunks_ the node should be written + // to. + size_t find_chunk_ptr_for_size(size_t size, bool non_null) const; + + struct FreeListNode { + FreeListNode *next; + size_t size; + }; + +public: + explicit FreeList(cpp::array sizes) + : chunks_(NUM_BUCKETS + 1, 0), sizes_(sizes.begin(), sizes.end()) {} + + FixedVector chunks_; + FixedVector sizes_; +}; + +template +bool FreeList::add_chunk(span chunk) { + // Check that the size is enough to actually store what we need + if (chunk.size() < sizeof(FreeListNode)) + return false; + + union { + FreeListNode *node; + cpp::byte *bytes; + } aliased; + + aliased.bytes = chunk.data(); + + unsigned short chunk_ptr = find_chunk_ptr_for_size(chunk.size(), false); + + // Add it to the correct list. + aliased.node->size = chunk.size(); + aliased.node->next = chunks_[chunk_ptr]; + chunks_[chunk_ptr] = aliased.node; + + return true; +} + +template +span FreeList::find_chunk(size_t size) const { + if (size == 0) + return span(); + + unsigned short chunk_ptr = find_chunk_ptr_for_size(size, true); + + // Check that there's data. This catches the case where we run off the + // end of the array + if (chunks_[chunk_ptr] == nullptr) + return span(); + + // Now iterate up the buckets, walking each list to find a good candidate + for (size_t i = chunk_ptr; i < chunks_.size(); i++) { + union { + FreeListNode *node; + cpp::byte *data; + } aliased; + aliased.node = chunks_[static_cast(i)]; + + while (aliased.node != nullptr) { + if (aliased.node->size >= size) + return span(aliased.data, aliased.node->size); + + aliased.node = aliased.node->next; + } + } + + // If we get here, we've checked every block in every bucket. There's + // nothing that can support this allocation. 
+ return span(); +} + +template +bool FreeList::remove_chunk(span chunk) { + unsigned short chunk_ptr = find_chunk_ptr_for_size(chunk.size(), true); + + // Walk that list, finding the chunk. + union { + FreeListNode *node; + cpp::byte *data; + } aliased, aliased_next; + + // Check head first. + if (chunks_[chunk_ptr] == nullptr) + return false; + + aliased.node = chunks_[chunk_ptr]; + if (aliased.data == chunk.data()) { + chunks_[chunk_ptr] = aliased.node->next; + return true; + } + + // No? Walk the nodes. + aliased.node = chunks_[chunk_ptr]; + + while (aliased.node->next != nullptr) { + aliased_next.node = aliased.node->next; + if (aliased_next.data == chunk.data()) { + // Found it, remove this node out of the chain + aliased.node->next = aliased_next.node->next; + return true; + } + + aliased.node = aliased.node->next; + } + + return false; +} + +template +size_t FreeList::find_chunk_ptr_for_size(size_t size, + bool non_null) const { + size_t chunk_ptr = 0; + for (chunk_ptr = 0u; chunk_ptr < sizes_.size(); chunk_ptr++) { + if (sizes_[chunk_ptr] >= size && + (!non_null || chunks_[chunk_ptr] != nullptr)) { + break; + } + } + + return chunk_ptr; +} + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDLIB_FREELIST_H diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index f122cd56a60605..d3954f077a219f 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -67,6 +67,18 @@ add_libc_test( libc.src.string.memcpy ) +add_libc_test( + freelist_test + SUITE + libc-stdlib-tests + SRCS + freelist_test.cpp + DEPENDS + libc.src.stdlib.freelist + libc.src.__support.CPP.array + libc.src.__support.CPP.span +) + add_fp_unittest( strtod_test SUITE diff --git a/libc/test/src/stdlib/freelist_test.cpp b/libc/test/src/stdlib/freelist_test.cpp new file mode 100644 index 00000000000000..e25c74b47b8522 --- /dev/null +++ b/libc/test/src/stdlib/freelist_test.cpp @@ -0,0 +1,166 @@ +//===-- Unittests for a 
freelist --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/span.h" +#include "src/stdlib/freelist.h" +#include "test/UnitTest/Test.h" + +using LIBC_NAMESPACE::FreeList; +using LIBC_NAMESPACE::cpp::array; +using LIBC_NAMESPACE::cpp::byte; +using LIBC_NAMESPACE::cpp::span; + +static constexpr size_t SIZE = 8; +static constexpr array example_sizes = {64, 128, 256, 512, + 1024, 2048, 4096, 8192}; + +TEST(LlvmLibcFreeList, EmptyListHasNoMembers) { + FreeList list(example_sizes); + + auto item = list.find_chunk(4); + EXPECT_EQ(item.size(), static_cast(0)); + item = list.find_chunk(128); + EXPECT_EQ(item.size(), static_cast(0)); +} + +TEST(LlvmLibcFreeList, CanRetrieveAddedMember) { + FreeList list(example_sizes); + constexpr size_t N = 512; + + byte data[N] = {byte(0)}; + + bool ok = list.add_chunk(span(data, N)); + EXPECT_TRUE(ok); + + auto item = list.find_chunk(N); + EXPECT_EQ(item.size(), N); + EXPECT_EQ(item.data(), data); +} + +TEST(LlvmLibcFreeList, CanRetrieveAddedMemberForSmallerSize) { + FreeList list(example_sizes); + constexpr size_t N = 512; + + byte data[N] = {byte(0)}; + + ASSERT_TRUE(list.add_chunk(span(data, N))); + auto item = list.find_chunk(N / 2); + EXPECT_EQ(item.size(), N); + EXPECT_EQ(item.data(), data); +} + +TEST(LlvmLibcFreeList, CanRemoveItem) { + FreeList list(example_sizes); + constexpr size_t N = 512; + + byte data[N] = {byte(0)}; + + ASSERT_TRUE(list.add_chunk(span(data, N))); + EXPECT_TRUE(list.remove_chunk(span(data, N))); + + auto item = list.find_chunk(N); + EXPECT_EQ(item.size(), static_cast(0)); +} + +TEST(LlvmLibcFreeList, FindReturnsSmallestChunk) { + FreeList 
list(example_sizes); + constexpr size_t kN1 = 512; + constexpr size_t kN2 = 1024; + + byte data1[kN1] = {byte(0)}; + byte data2[kN2] = {byte(0)}; + + ASSERT_TRUE(list.add_chunk(span(data1, kN1))); + ASSERT_TRUE(list.add_chunk(span(data2, kN2))); + + auto chunk = list.find_chunk(kN1 / 2); + EXPECT_EQ(chunk.size(), kN1); + EXPECT_EQ(chunk.data(), data1); + + chunk = list.find_chunk(kN1); + EXPECT_EQ(chunk.size(), kN1); + EXPECT_EQ(chunk.data(), data1); + + chunk = list.find_chunk(kN1 + 1); + EXPECT_EQ(chunk.size(), kN2); + EXPECT_EQ(chunk.data(), data2); +} + +TEST(LlvmLibcFreeList, FindReturnsCorrectChunkInSameBucket) { + // If we have two values in the same bucket, ensure that the allocation will + // pick an appropriately sized one. + FreeList list(example_sizes); + constexpr size_t kN1 = 512; + constexpr size_t kN2 = 257; + + byte data1[kN1] = {byte(0)}; + byte data2[kN2] = {byte(0)}; + + // List should now be 257 -> 512 -> NULL + ASSERT_TRUE(list.add_chunk(span(data1, kN1))); + ASSERT_TRUE(list.add_chunk(span(data2, kN2))); + + auto chunk = list.find_chunk(kN2 + 1); + EXPECT_EQ(chunk.size(), kN1); +} + +TEST(LlvmLibcFreeList, FindCanMoveUpThroughBuckets) { + // Ensure that finding a chunk will move up through buckets if no appropriate + // chunks were found in a given bucket + FreeList list(example_sizes); + constexpr size_t kN1 = 257; + constexpr size_t kN2 = 513; + + byte data1[kN1] = {byte(0)}; + byte data2[kN2] = {byte(0)}; + + // List should now be: + // bkt[3] (257 bytes up to 512 bytes) -> 257 -> NULL + // bkt[4] (513 bytes up to 1024 bytes) -> 513 -> NULL + ASSERT_TRUE(list.add_chunk(span(data1, kN1))); + ASSERT_TRUE(list.add_chunk(span(data2, kN2))); + + // Request a 258 byte chunk (kN1 + 1).
This should return the 513 byte one + auto chunk = list.find_chunk(kN1 + 1); + EXPECT_EQ(chunk.size(), kN2); +} + +TEST(LlvmLibcFreeList, RemoveUnknownChunkReturnsNotFound) { + FreeList list(example_sizes); + constexpr size_t N = 512; + + byte data[N] = {byte(0)}; + byte data2[N] = {byte(0)}; + + ASSERT_TRUE(list.add_chunk(span(data, N))); + EXPECT_FALSE(list.remove_chunk(span(data2, N))); +} + +TEST(LlvmLibcFreeList, CanStoreMultipleChunksPerBucket) { + FreeList list(example_sizes); + constexpr size_t N = 512; + + byte data1[N] = {byte(0)}; + byte data2[N] = {byte(0)}; + + ASSERT_TRUE(list.add_chunk(span(data1, N))); + ASSERT_TRUE(list.add_chunk(span(data2, N))); + + auto chunk1 = list.find_chunk(N); + ASSERT_TRUE(list.remove_chunk(chunk1)); + auto chunk2 = list.find_chunk(N); + ASSERT_TRUE(list.remove_chunk(chunk2)); + + // Ordering of the chunks doesn't matter + EXPECT_TRUE(chunk1.data() != chunk2.data()); + EXPECT_TRUE(chunk1.data() == data1 || chunk1.data() == data2); + EXPECT_TRUE(chunk2.data() == data1 || chunk2.data() == data2); +} From 189d4711915f4ce89b373f3cbcfe1f19c73becd9 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:46:26 -0400 Subject: [PATCH 06/82] [clang] Reland Add tanf16 builtin and support for tan constrained intrinsic (#94559) Relanding this PR now that https://github.com/llvm/llvm-project/pull/90503 has merged. With `FTAN` landing in [TargetLoweringBase.cpp:L1021](https://github.com/llvm/llvm-project/blob/main/llvm/lib/CodeGen/TargetLoweringBase.cpp#L1020C23-L1021C63 ), there is now an LLVM tan intrinsic 32/64/128-bit Expand case for all LLVM backends. In LLVM, the `llvm.experimental.constrained.cos` and `llvm.experimental.constrained.sin` intrinsics are used for performing cosine and sine calculations with additional constraints on floating-point operations. This behavior is expected for all floating-point math intrinsics. This change adds these constraints for the `tan` intrinsic.
- `Builtins.td` - replace TanF128 with F16F128MathTemplate - `CGBuiltin.cpp` - map existing tan builtins to `tan` and `constrained_tan` intrinsic - `ConstrainedOps.def` map tan and constrained_tan to an ISDOpcode. resolves #91421 --------- Co-authored-by: Farzon Lotfi --- clang/include/clang/Basic/Builtins.td | 6 +- clang/lib/CodeGen/CGBuiltin.cpp | 12 + clang/test/CodeGen/X86/math-builtins.c | 8 +- .../test/CodeGen/constrained-math-builtins.c | 13 + clang/test/CodeGen/math-libcalls.c | 12 +- clang/test/CodeGenOpenCL/builtins-f16.cl | 3 + llvm/docs/LangRef.rst | 36 ++ llvm/include/llvm/IR/ConstrainedOps.def | 1 + llvm/include/llvm/IR/Intrinsics.td | 4 + .../Target/AArch64/AArch64ISelLowering.cpp | 16 +- llvm/test/Assembler/fp-intrinsics-attr.ll | 8 + .../CodeGen/AArch64/fp-intrinsics-fp16.ll | 16 + llvm/test/CodeGen/AArch64/fp-intrinsics.ll | 31 ++ llvm/test/CodeGen/ARM/fp-intrinsics.ll | 16 + .../CodeGen/PowerPC/ctrloop-constrained-fp.ll | 50 +++ .../ppcf128-constrained-fp-intrinsics.ll | 45 +++ .../vector-constrained-fp-intrinsics.ll | 356 ++++++++++++++++++ .../CodeGen/RISCV/double-intrinsics-strict.ll | 60 +++ .../CodeGen/RISCV/float-intrinsics-strict.ll | 60 +++ .../vector-constrained-fp-intrinsics.ll | 322 ++++++++++++++++ llvm/test/CodeGen/X86/fp-intrinsics.ll | 53 +++ .../CodeGen/X86/fp-strict-libcalls-msvc32.ll | 18 + .../test/CodeGen/X86/fp128-libcalls-strict.ll | 41 ++ llvm/test/CodeGen/X86/fp80-strict-libcalls.ll | 26 ++ .../X86/vector-constrained-fp-intrinsics.ll | 233 ++++++++++++ llvm/test/Feature/fp-intrinsics.ll | 12 + 26 files changed, 1437 insertions(+), 21 deletions(-) diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 11982af3fa609b..7bef5fd7ad40f2 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -482,11 +482,11 @@ def SqrtF16F128 : Builtin, F16F128MathTemplate { let Prototype = "T(T)"; } -def TanF128 : Builtin { - let Spellings = 
["__builtin_tanf128"]; +def TanF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_tan"]; let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; - let Prototype = "__float128(__float128)"; + let Prototype = "T(T)"; } def TanhF128 : Builtin { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index c16b69ba875679..06e201fa71e6ff 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2923,6 +2923,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, SetSqrtFPAccuracy(Call); return RValue::get(Call); } + + case Builtin::BItan: + case Builtin::BItanf: + case Builtin::BItanl: + case Builtin::BI__builtin_tan: + case Builtin::BI__builtin_tanf: + case Builtin::BI__builtin_tanf16: + case Builtin::BI__builtin_tanl: + case Builtin::BI__builtin_tanf128: + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::tan, Intrinsic::experimental_constrained_tan)); + case Builtin::BItrunc: case Builtin::BItruncf: case Builtin::BItruncl: diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index 093239b4482609..1e0f129b986102 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -674,10 +674,10 @@ __builtin_sqrt(f); __builtin_sqrtf(f); __builtin_sqrtl(f); __builtin_ __builtin_tan(f); __builtin_tanf(f); __builtin_tanl(f); __builtin_tanf128(f); -// NO__ERRNO: declare double @tan(double noundef) [[READNONE]] -// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]] -// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]] -// NO__ERRNO: declare fp128 @tanf128(fp128 noundef) [[READNONE]] +// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: 
declare fp128 @llvm.tan.f128(fp128) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]] diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c index 2de832dd2b6cae..6cc3a10a1e7946 100644 --- a/clang/test/CodeGen/constrained-math-builtins.c +++ b/clang/test/CodeGen/constrained-math-builtins.c @@ -183,6 +183,14 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _ // CHECK: call x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // CHECK: call fp128 @llvm.experimental.constrained.sqrt.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") + __builtin_tan(f); __builtin_tanf(f); __builtin_tanl(f); __builtin_tanf128(f); + +// CHECK: call double @llvm.experimental.constrained.tan.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") +// CHECK: call float @llvm.experimental.constrained.tan.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") +// CHECK: call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") +// CHECK: call fp128 @llvm.experimental.constrained.tan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") + + __builtin_trunc(f); __builtin_truncf(f); __builtin_truncl(f); __builtin_truncf128(f); // CHECK: call double @llvm.experimental.constrained.trunc.f64(double %{{.*}}, metadata !"fpexcept.strict") @@ -315,6 +323,11 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _ // CHECK: declare x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80, metadata, metadata) // CHECK: declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, 
metadata, metadata) +// CHECK: declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) +// CHECK: declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) +// CHECK: declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata) +// CHECK: declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata) + // CHECK: declare double @llvm.experimental.constrained.trunc.f64(double, metadata) // CHECK: declare float @llvm.experimental.constrained.trunc.f32(float, metadata) // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index 29c312ba0ecac2..a249182692762d 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -662,15 +662,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { tan(f); tanf(f); tanl(f); -// NO__ERRNO: declare double @tan(double noundef) [[READNONE]] -// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]] -// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]] +// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]] -// HAS_MAYTRAP: declare double @tan(double noundef) [[NOT_READNONE]] -// HAS_MAYTRAP: declare float @tanf(float noundef) [[NOT_READNONE]] -// HAS_MAYTRAP: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare double @llvm.experimental.constrained.tan.f64( +// HAS_MAYTRAP: declare float @llvm.experimental.constrained.tan.f32( +// HAS_MAYTRAP: declare 
x86_fp80 @llvm.experimental.constrained.tan.f80( tanh(f); tanhf(f); tanhl(f); diff --git a/clang/test/CodeGenOpenCL/builtins-f16.cl b/clang/test/CodeGenOpenCL/builtins-f16.cl index adf7cdde154f51..d7bffdad5c548f 100644 --- a/clang/test/CodeGenOpenCL/builtins-f16.cl +++ b/clang/test/CodeGenOpenCL/builtins-f16.cl @@ -66,6 +66,9 @@ void test_half_builtins(half h0, half h1, half h2, int i0) { // CHECK: call half @llvm.sqrt.f16(half %h0) res = __builtin_sqrtf16(h0); + // CHECK: call half @llvm.tan.f16(half %h0) + res = __builtin_tanf16(h0); + // CHECK: call half @llvm.trunc.f16(half %h0) res = __builtin_truncf16(h0); diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 9fb2c048a5c869..c11a6627d81d31 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -26244,6 +26244,42 @@ same values as the libm ``cos`` functions would, and handles error conditions in the same way. +'``llvm.experimental.constrained.tan``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare <type> + @llvm.experimental.constrained.tan(<type> <op1>, + metadata <rounding mode>, + metadata <exception behavior>) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.tan``' intrinsic returns the tangent of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return type are floating-point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the tangent of the specified operand, returning the +same values as the libm ``tan`` functions would, and handles error +conditions in the same way.
+ + '``llvm.experimental.constrained.exp``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def index 41aa44de957f93..a7b37c5cb204da 100644 --- a/llvm/include/llvm/IR/ConstrainedOps.def +++ b/llvm/include/llvm/IR/ConstrainedOps.def @@ -95,6 +95,7 @@ DAG_FUNCTION(round, 1, 0, experimental_constrained_round, FROUND) DAG_FUNCTION(roundeven, 1, 0, experimental_constrained_roundeven, FROUNDEVEN) DAG_FUNCTION(sin, 1, 1, experimental_constrained_sin, FSIN) DAG_FUNCTION(sqrt, 1, 1, experimental_constrained_sqrt, FSQRT) +DAG_FUNCTION(tan, 1, 1, experimental_constrained_tan, FTAN) DAG_FUNCTION(trunc, 1, 0, experimental_constrained_trunc, FTRUNC) // This is definition for fmuladd intrinsic function, that is converted into diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 107442623ab7bd..4c506a6ace23ea 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1218,6 +1218,10 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_tan : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_pow : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, LLVMMatchType<0>, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 87d737d7ffe61c..c4f819f5fcdd29 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -728,14 +728,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote); } - for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, - ISD::FCOS, ISD::FSIN, ISD::FSINCOS, - ISD::FTAN, ISD::FEXP, 
ISD::FEXP2, - ISD::FEXP10, ISD::FLOG, ISD::FLOG2, - ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, - ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, - ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG, - ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) { + for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, + ISD::FCOS, ISD::FSIN, ISD::FSINCOS, + ISD::FTAN, ISD::FEXP, ISD::FEXP2, + ISD::FEXP10, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, + ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, + ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG, + ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) { setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::v4f16, Expand); setOperationAction(Op, MVT::v8f16, Expand); diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll index 6546d1a275c99f..613630e1a2b4d2 100644 --- a/llvm/test/Assembler/fp-intrinsics-attr.ll +++ b/llvm/test/Assembler/fp-intrinsics-attr.ll @@ -85,6 +85,11 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp { metadata !"round.dynamic", metadata !"fpexcept.strict") + %tan = call double @llvm.experimental.constrained.tan.f64( + double %a, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + %pow = call double @llvm.experimental.constrained.pow.f64( double %a, double %b, metadata !"round.dynamic", @@ -244,6 +249,9 @@ declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) ; CHECK: @llvm.experimental.constrained.cos.f64({{.*}}) #[[ATTR1]] +declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) +; CHECK: @llvm.experimental.constrained.tan.f64({{.*}}) #[[ATTR1]] + declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) ; CHECK: @llvm.experimental.constrained.pow.f64({{.*}}) #[[ATTR1]] diff --git 
a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll index 48062c9a54b5d3..b09ed8d3eb764e 100644 --- a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll +++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll @@ -338,6 +338,21 @@ define half @cos_f16(half %x) #0 { ret half %val } +define half @tan_f16(half %x) #0 { +; CHECK-LABEL: tan_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: bl tanf +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %val = call half @llvm.experimental.constrained.tan.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret half %val +} + define half @pow_f16(half %x, half %y) #0 { ; CHECK-LABEL: pow_f16: ; CHECK: // %bb.0: @@ -1147,6 +1162,7 @@ declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) declare half @llvm.experimental.constrained.powi.f16(half, i32, metadata, metadata) declare half @llvm.experimental.constrained.sin.f16(half, metadata, metadata) declare half @llvm.experimental.constrained.cos.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.tan.f16(half, metadata, metadata) declare half @llvm.experimental.constrained.pow.f16(half, half, metadata, metadata) declare half @llvm.experimental.constrained.log.f16(half, metadata, metadata) declare half @llvm.experimental.constrained.log10.f16(half, metadata, metadata) diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll index 685efbb7cad431..67d0b63f4076f6 100644 --- a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll @@ -146,6 +146,13 @@ define float @cos_f32(float %x) #0 { ret float %val } +; CHECK-LABEL: tan_f32: +; CHECK: bl tanf +define float @tan_f32(float %x) #0 { + %val = 
call float @llvm.experimental.constrained.tan.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + ; CHECK-LABEL: pow_f32: ; CHECK: bl powf define float @pow_f32(float %x, float %y) #0 { @@ -630,6 +637,13 @@ define double @cos_f64(double %x) #0 { ret double %val } +; CHECK-LABEL: tan_f64: +; CHECK: bl tan +define double @tan_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.tan.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + ; CHECK-LABEL: pow_f64: ; CHECK: bl pow define double @pow_f64(double %x, double %y) #0 { @@ -1114,6 +1128,13 @@ define fp128 @cos_f128(fp128 %x) #0 { ret fp128 %val } +; CHECK-LABEL: tan_f128: +; CHECK: bl tanl +define fp128 @tan_f128(fp128 %x) #0 { + %val = call fp128 @llvm.experimental.constrained.tan.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret fp128 %val +} + ; CHECK-LABEL: pow_f128: ; CHECK: bl powl define fp128 @pow_f128(fp128 %x, fp128 %y) #0 { @@ -1491,6 +1512,13 @@ define <1 x double> @cos_v1f64(<1 x double> %x, <1 x double> %y) #0 { ret <1 x double> %val } +; CHECK-LABEL: tan_v1f64: +; CHECK: bl tan +define <1 x double> @tan_v1f64(<1 x double> %x, <1 x double> %y) #0 { + %val = call <1 x double> @llvm.experimental.constrained.tan.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret <1 x double> %val +} + ; CHECK-LABEL: pow_v1f64: ; CHECK: bl pow define <1 x double> @pow_v1f64(<1 x double> %x, <1 x double> %y) #0 { @@ -1555,6 +1583,7 @@ declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, metadata) declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) declare 
float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata) @@ -1599,6 +1628,7 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadat declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata) @@ -1643,6 +1673,7 @@ declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata declare fp128 @llvm.experimental.constrained.powi.f128(fp128, i32, metadata, metadata) declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.cos.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.pow.f128(fp128, fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.log.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.log10.f128(fp128, metadata, metadata) diff --git a/llvm/test/CodeGen/ARM/fp-intrinsics.ll b/llvm/test/CodeGen/ARM/fp-intrinsics.ll index 64b22a5cc71bcc..e286eb3226e46f 100644 --- a/llvm/test/CodeGen/ARM/fp-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/fp-intrinsics.ll @@ -139,6 +139,13 @@ define float @cos_f32(float %x) #0 { ret float %val } +; CHECK-LABEL: tan_f32: +; CHECK: bl tanf +define float @tan_f32(float %x) 
#0 { + %val = call float @llvm.experimental.constrained.tan.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + ; CHECK-LABEL: pow_f32: ; CHECK: bl powf define float @pow_f32(float %x, float %y) #0 { @@ -596,6 +603,13 @@ define double @cos_f64(double %x) #0 { ret double %val } +; CHECK-LABEL: tan_f64: +; CHECK: bl tan +define double @tan_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.tan.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + ; CHECK-LABEL: pow_f64: ; CHECK: bl pow define double @pow_f64(double %x, double %y) #0 { @@ -1023,6 +1037,7 @@ declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, metadata) declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata) @@ -1056,6 +1071,7 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadat declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) declare double 
@llvm.experimental.constrained.log10.f64(double, metadata, metadata) diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll index 50ebe0471dceac..402ecb763d5b33 100644 --- a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll +++ b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll @@ -83,5 +83,55 @@ exit: ret void } +; Check constrained ops converted to call +define void @testTan(ptr %cast) strictfp { +; CHECK-LABEL: testTan: +; CHECK: # %bb.0: # %root +; CHECK-NEXT: mflr 0 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: .cfi_offset r29, -24 +; CHECK-NEXT: .cfi_offset r30, -16 +; CHECK-NEXT: std 29, -24(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, -16(1) # 8-byte Folded Spill +; CHECK-NEXT: stdu 1, -64(1) +; CHECK-NEXT: addi 30, 3, -8 +; CHECK-NEXT: li 29, 255 +; CHECK-NEXT: std 0, 80(1) +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB2_1: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lfdu 1, 8(30) +; CHECK-NEXT: bl tan +; CHECK-NEXT: nop +; CHECK-NEXT: addi 29, 29, -1 +; CHECK-NEXT: stfd 1, 0(30) +; CHECK-NEXT: cmpldi 29, 0 +; CHECK-NEXT: bc 12, 1, .LBB2_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: addi 1, 1, 64 +; CHECK-NEXT: ld 0, 16(1) +; CHECK-NEXT: ld 30, -16(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 29, -24(1) # 8-byte Folded Reload +; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: blr +root: + br label %for.body + +exit: + ret void + +for.body: + %i = phi i64 [ 0, %root ], [ %next, %for.body ] + %idx = getelementptr inbounds double, ptr %cast, i64 %i + %val = load double, ptr %idx + %tan = tail call nnan ninf nsz arcp double @llvm.experimental.constrained.tan.f64(double %val, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %tan, ptr %idx, align 8 + %next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %next, 255 + br i1 %cond, label %exit, label %for.body +} + declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) 
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll index 42972fe069df6a..76f3dea5b7751d 100644 --- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll @@ -2066,6 +2066,50 @@ entry: ret i1 %conv } +define ppc_fp128 @test_tan_ppc_fp128(ppc_fp128 %first) #0 { +; PC64LE-LABEL: test_tan_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: std 0, 48(1) +; PC64LE-NEXT: bl tanl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_tan_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: std 0, 48(1) +; PC64LE9-NEXT: bl tanl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_tan_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: std 0, 128(1) +; PC64-NEXT: bl tanl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %tan = call ppc_fp128 @llvm.experimental.constrained.tan.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #1 + ret ppc_fp128 %tan +} + attributes #0 = { nounwind strictfp } attributes #1 = { strictfp } @@ -2096,6 +2140,7 @@ declare ppc_fp128 @llvm.experimental.constrained.round.ppcf128(ppc_fp128, metada declare ppc_fp128 @llvm.experimental.constrained.sin.ppcf128(ppc_fp128, metadata, metadata) declare ppc_fp128 @llvm.experimental.constrained.sqrt.ppcf128(ppc_fp128, 
metadata, metadata) declare ppc_fp128 @llvm.experimental.constrained.fsub.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.tan.ppcf128(ppc_fp128, metadata, metadata) declare ppc_fp128 @llvm.experimental.constrained.trunc.ppcf128(ppc_fp128, metadata) declare i64 @llvm.experimental.constrained.fptosi.i64.ppcf128(ppc_fp128, metadata) declare i32 @llvm.experimental.constrained.fptosi.i32.ppcf128(ppc_fp128, metadata) diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll index 9cabe0c17d849d..f217162782bfd9 100644 --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -8302,6 +8302,357 @@ entry: ret <4 x float> %result } +define <1 x float> @constrained_vector_tan_v1f32(<1 x float> %x) #0 { +; PC64LE-LABEL: constrained_vector_tan_v1f32: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: std 0, 48(1) +; PC64LE-NEXT: bl tanf +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: constrained_vector_tan_v1f32: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: std 0, 48(1) +; PC64LE9-NEXT: bl tanf +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +entry: + %tan = call <1 x float> @llvm.experimental.constrained.tan.v1f32( + <1 x float> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #1 + ret <1 x float> %tan +} + +define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 { +; PC64LE-LABEL: constrained_vector_tan_v2f64: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: stdu 1, -80(1) +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: std 0, 96(1) +; PC64LE-NEXT: 
stxvd2x 62, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: stxvd2x 63, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: vmr 31, 2 +; PC64LE-NEXT: xxlor 1, 63, 63 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: xxlor 62, 1, 1 +; PC64LE-NEXT: xxswapd 1, 63 +; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: xxmrghd 34, 62, 1 +; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: lxvd2x 62, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: addi 1, 1, 80 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: constrained_vector_tan_v2f64: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: stdu 1, -64(1) +; PC64LE9-NEXT: std 0, 80(1) +; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: vmr 31, 2 +; PC64LE9-NEXT: xscpsgndp 1, 63, 63 +; PC64LE9-NEXT: stxv 62, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: xscpsgndp 62, 1, 1 +; PC64LE9-NEXT: xxswapd 1, 63 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 34, 62, 1 +; PC64LE9-NEXT: lxv 63, 48(1) # 16-byte Folded Reload +; PC64LE9-NEXT: lxv 62, 32(1) # 16-byte Folded Reload +; PC64LE9-NEXT: addi 1, 1, 64 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +entry: + %tan = call <2 x double> @llvm.experimental.constrained.tan.v2f64( + <2 x double> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #1 + ret <2 x double> %tan +} + +define <3 x float> @constrained_vector_tan_v3f32(<3 x float> %x) #0 { +; PC64LE-LABEL: constrained_vector_tan_v3f32: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: stdu 1, -80(1) +; PC64LE-NEXT: 
xxsldwi 0, 34, 34, 1 +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: std 0, 96(1) +; PC64LE-NEXT: stfd 30, 64(1) # 8-byte Folded Spill +; PC64LE-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE-NEXT: xscvspdpn 1, 0 +; PC64LE-NEXT: stxvd2x 63, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: vmr 31, 2 +; PC64LE-NEXT: bl tanf +; PC64LE-NEXT: nop +; PC64LE-NEXT: xxswapd 0, 63 +; PC64LE-NEXT: fmr 31, 1 +; PC64LE-NEXT: xscvspdpn 1, 0 +; PC64LE-NEXT: bl tanf +; PC64LE-NEXT: nop +; PC64LE-NEXT: xxsldwi 0, 63, 63, 3 +; PC64LE-NEXT: fmr 30, 1 +; PC64LE-NEXT: xscvspdpn 1, 0 +; PC64LE-NEXT: bl tanf +; PC64LE-NEXT: nop +; PC64LE-NEXT: xscvdpspn 0, 1 +; PC64LE-NEXT: xscvdpspn 1, 30 +; PC64LE-NEXT: addis 3, 2, .LCPI189_0@toc@ha +; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload +; PC64LE-NEXT: xscvdpspn 36, 31 +; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload +; PC64LE-NEXT: addi 3, 3, .LCPI189_0@toc@l +; PC64LE-NEXT: xxmrghw 34, 1, 0 +; PC64LE-NEXT: lxvd2x 0, 0, 3 +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: xxswapd 35, 0 +; PC64LE-NEXT: vperm 2, 4, 2, 3 +; PC64LE-NEXT: addi 1, 1, 80 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: constrained_vector_tan_v3f32: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: stdu 1, -64(1) +; PC64LE9-NEXT: xxsldwi 0, 34, 34, 1 +; PC64LE9-NEXT: std 0, 80(1) +; PC64LE9-NEXT: stfd 30, 48(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: stfd 31, 56(1) # 8-byte Folded Spill +; PC64LE9-NEXT: vmr 31, 2 +; PC64LE9-NEXT: xscvspdpn 1, 0 +; PC64LE9-NEXT: bl tanf +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: xxswapd 0, 63 +; PC64LE9-NEXT: fmr 31, 1 +; PC64LE9-NEXT: xscvspdpn 1, 0 +; PC64LE9-NEXT: bl tanf +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: xxsldwi 0, 63, 63, 3 +; PC64LE9-NEXT: fmr 30, 1 +; PC64LE9-NEXT: xscvspdpn 1, 0 +; PC64LE9-NEXT: bl tanf +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: xscvdpspn 0, 1 +; 
PC64LE9-NEXT: xscvdpspn 1, 30 +; PC64LE9-NEXT: addis 3, 2, .LCPI189_0@toc@ha +; PC64LE9-NEXT: xscvdpspn 34, 31 +; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload +; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload +; PC64LE9-NEXT: addi 3, 3, .LCPI189_0@toc@l +; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload +; PC64LE9-NEXT: xxmrghw 35, 1, 0 +; PC64LE9-NEXT: lxv 0, 0(3) +; PC64LE9-NEXT: xxperm 34, 35, 0 +; PC64LE9-NEXT: addi 1, 1, 64 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +entry: + %tan = call <3 x float> @llvm.experimental.constrained.tan.v3f32( + <3 x float> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #1 + ret <3 x float> %tan +} + +define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 { +; PC64LE-LABEL: constrained_vector_tan_v3f64: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: stdu 1, -80(1) +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: std 0, 96(1) +; PC64LE-NEXT: stfd 30, 64(1) # 8-byte Folded Spill +; PC64LE-NEXT: fmr 30, 2 +; PC64LE-NEXT: stfd 31, 72(1) # 8-byte Folded Spill +; PC64LE-NEXT: fmr 31, 3 +; PC64LE-NEXT: stxvd2x 63, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: xxlor 63, 1, 1 +; PC64LE-NEXT: fmr 1, 30 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: xxmrghd 63, 1, 63 +; PC64LE-NEXT: fmr 1, 31 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: fmr 3, 1 +; PC64LE-NEXT: xxswapd 1, 63 +; PC64LE-NEXT: lfd 31, 72(1) # 8-byte Folded Reload +; PC64LE-NEXT: xxlor 2, 63, 63 +; PC64LE-NEXT: lfd 30, 64(1) # 8-byte Folded Reload +; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: addi 1, 1, 80 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: constrained_vector_tan_v3f64: +; PC64LE9: # %bb.0: # %entry +; 
PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: stdu 1, -64(1) +; PC64LE9-NEXT: std 0, 80(1) +; PC64LE9-NEXT: stfd 30, 48(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: stfd 31, 56(1) # 8-byte Folded Spill +; PC64LE9-NEXT: fmr 31, 3 +; PC64LE9-NEXT: fmr 30, 2 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: xscpsgndp 63, 1, 1 +; PC64LE9-NEXT: fmr 1, 30 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 63, 1, 63 +; PC64LE9-NEXT: fmr 1, 31 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: fmr 3, 1 +; PC64LE9-NEXT: xxswapd 1, 63 +; PC64LE9-NEXT: xscpsgndp 2, 63, 63 +; PC64LE9-NEXT: lxv 63, 32(1) # 16-byte Folded Reload +; PC64LE9-NEXT: lfd 31, 56(1) # 8-byte Folded Reload +; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE9-NEXT: lfd 30, 48(1) # 8-byte Folded Reload +; PC64LE9-NEXT: addi 1, 1, 64 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +entry: + %tan = call <3 x double> @llvm.experimental.constrained.tan.v3f64( + <3 x double> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #1 + ret <3 x double> %tan +} + +define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 { +; PC64LE-LABEL: constrained_vector_tan_v4f64: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: stdu 1, -96(1) +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: std 0, 112(1) +; PC64LE-NEXT: stxvd2x 61, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: stxvd2x 62, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: vmr 30, 2 +; PC64LE-NEXT: li 3, 80 +; PC64LE-NEXT: xxlor 1, 62, 62 +; PC64LE-NEXT: stxvd2x 63, 1, 3 # 16-byte Folded Spill +; PC64LE-NEXT: vmr 31, 3 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: xxlor 61, 1, 1 +; PC64LE-NEXT: xxswapd 1, 62 +; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: 
# kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: xxmrghd 62, 61, 1 +; PC64LE-NEXT: xxlor 1, 63, 63 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: xxlor 61, 1, 1 +; PC64LE-NEXT: xxswapd 1, 63 +; PC64LE-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE-NEXT: bl tan +; PC64LE-NEXT: nop +; PC64LE-NEXT: li 3, 80 +; PC64LE-NEXT: vmr 2, 30 +; PC64LE-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE-NEXT: xxmrghd 35, 61, 1 +; PC64LE-NEXT: lxvd2x 63, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: li 3, 64 +; PC64LE-NEXT: lxvd2x 62, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: li 3, 48 +; PC64LE-NEXT: lxvd2x 61, 1, 3 # 16-byte Folded Reload +; PC64LE-NEXT: addi 1, 1, 96 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: constrained_vector_tan_v4f64: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: stdu 1, -80(1) +; PC64LE9-NEXT: std 0, 96(1) +; PC64LE9-NEXT: stxv 62, 48(1) # 16-byte Folded Spill +; PC64LE9-NEXT: vmr 30, 2 +; PC64LE9-NEXT: xscpsgndp 1, 62, 62 +; PC64LE9-NEXT: stxv 61, 32(1) # 16-byte Folded Spill +; PC64LE9-NEXT: stxv 63, 64(1) # 16-byte Folded Spill +; PC64LE9-NEXT: vmr 31, 3 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: xscpsgndp 61, 1, 1 +; PC64LE9-NEXT: xxswapd 1, 62 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 62, 61, 1 +; PC64LE9-NEXT: xscpsgndp 1, 63, 63 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: xscpsgndp 61, 1, 1 +; PC64LE9-NEXT: xxswapd 1, 63 +; PC64LE9-NEXT: # kill: def $f1 killed $f1 killed $vsl1 +; PC64LE9-NEXT: bl tan +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1 +; PC64LE9-NEXT: xxmrghd 35, 61, 1 +; PC64LE9-NEXT: vmr 2, 30 +; PC64LE9-NEXT: lxv 63, 64(1) # 16-byte Folded Reload +; PC64LE9-NEXT: lxv 62, 48(1) # 16-byte Folded Reload +; PC64LE9-NEXT: lxv 61, 32(1) # 
16-byte Folded Reload +; PC64LE9-NEXT: addi 1, 1, 80 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +entry: + %tan = call <4 x double> @llvm.experimental.constrained.tan.v4f64( + <4 x double> %x, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #1 + ret <4 x double> %tan +} + attributes #0 = { nounwind strictfp noimplicitfloat } attributes #1 = { strictfp } @@ -8316,6 +8667,7 @@ declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata) @@ -8361,6 +8713,7 @@ declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x f declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata) @@ -8414,6 +8767,8 @@ declare <3 x float> 
@llvm.experimental.constrained.sin.v3f32(<3 x float>, metada declare <3 x double> @llvm.experimental.constrained.sin.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata) @@ -8470,6 +8825,7 @@ declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata) diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll b/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll index 38215860193eaf..4cb6191e7322e9 100644 --- a/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll +++ b/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll @@ -375,6 +375,66 @@ define double @sincos_f64(double %a) nounwind strictfp { ret double %3 } +declare double 
@llvm.experimental.constrained.tan.f64(double, metadata, metadata) + +define double @tan_f64(double %a) nounwind strictfp { +; RV32IFD-LABEL: tan_f64: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: addi sp, sp, -16 +; RV32IFD-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFD-NEXT: call tan +; RV32IFD-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFD-NEXT: addi sp, sp, 16 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: tan_f64: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call tan +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; +; RV32IZFINXZDINX-LABEL: tan_f64: +; RV32IZFINXZDINX: # %bb.0: +; RV32IZFINXZDINX-NEXT: addi sp, sp, -16 +; RV32IZFINXZDINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IZFINXZDINX-NEXT: call tan +; RV32IZFINXZDINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 +; RV32IZFINXZDINX-NEXT: ret +; +; RV64IZFINXZDINX-LABEL: tan_f64: +; RV64IZFINXZDINX: # %bb.0: +; RV64IZFINXZDINX-NEXT: addi sp, sp, -16 +; RV64IZFINXZDINX-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IZFINXZDINX-NEXT: call tan +; RV64IZFINXZDINX-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IZFINXZDINX-NEXT: addi sp, sp, 16 +; RV64IZFINXZDINX-NEXT: ret +; +; RV32I-LABEL: tan_f64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call tan +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: tan_f64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call tan +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret + %1 = call double @llvm.experimental.constrained.tan.f64(double %a, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp + ret double %1 +} + 
declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) define double @pow_f64(double %a, double %b) nounwind strictfp { diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll b/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll index 626db1985bfc7e..e4be5074cb800a 100644 --- a/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll +++ b/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll @@ -354,6 +354,66 @@ define float @sincos_f32(float %a) nounwind strictfp { ret float %3 } +declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) + +define float @tan_f32(float %a) nounwind strictfp { +; RV32IF-LABEL: tan_f32: +; RV32IF: # %bb.0: +; RV32IF-NEXT: addi sp, sp, -16 +; RV32IF-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IF-NEXT: call tanf +; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IF-NEXT: addi sp, sp, 16 +; RV32IF-NEXT: ret +; +; RV64IF-LABEL: tan_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: call tanf +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret +; +; RV32IZFINX-LABEL: tan_f32: +; RV32IZFINX: # %bb.0: +; RV32IZFINX-NEXT: addi sp, sp, -16 +; RV32IZFINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IZFINX-NEXT: call tanf +; RV32IZFINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IZFINX-NEXT: addi sp, sp, 16 +; RV32IZFINX-NEXT: ret +; +; RV64IZFINX-LABEL: tan_f32: +; RV64IZFINX: # %bb.0: +; RV64IZFINX-NEXT: addi sp, sp, -16 +; RV64IZFINX-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IZFINX-NEXT: call tanf +; RV64IZFINX-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IZFINX-NEXT: addi sp, sp, 16 +; RV64IZFINX-NEXT: ret +; +; RV32I-LABEL: tan_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call tanf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: 
addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: tan_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call tanf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret + %1 = call float @llvm.experimental.constrained.tan.f32(float %a, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp + ret float %1 +} + declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata) define float @pow_f32(float %a, float %b) nounwind strictfp { diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll index 9d77744f18ca1a..4a109ee96a3d3e 100644 --- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -6222,6 +6222,323 @@ entry: ret void } +define <1 x float> @constrained_vector_tan_v1f32() #0 { +; S390X-LABEL: constrained_vector_tan_v1f32: +; S390X: # %bb.0: # %entry +; S390X-NEXT: stmg %r14, %r15, 112(%r15) +; S390X-NEXT: .cfi_offset %r14, -48 +; S390X-NEXT: .cfi_offset %r15, -40 +; S390X-NEXT: aghi %r15, -160 +; S390X-NEXT: .cfi_def_cfa_offset 320 +; S390X-NEXT: larl %r1, .LCPI119_0 +; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: brasl %r14, tanf@PLT +; S390X-NEXT: lmg %r14, %r15, 272(%r15) +; S390X-NEXT: br %r14 +; +; SZ13-LABEL: constrained_vector_tan_v1f32: +; SZ13: # %bb.0: # %entry +; SZ13-NEXT: stmg %r14, %r15, 112(%r15) +; SZ13-NEXT: .cfi_offset %r14, -48 +; SZ13-NEXT: .cfi_offset %r15, -40 +; SZ13-NEXT: aghi %r15, -160 +; SZ13-NEXT: .cfi_def_cfa_offset 320 +; SZ13-NEXT: larl %r1, .LCPI119_0 +; SZ13-NEXT: lde %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tanf@PLT +; SZ13-NEXT: # kill: def $f0s killed $f0s def $v0 +; SZ13-NEXT: vlr %v24, %v0 +; SZ13-NEXT: lmg %r14, %r15, 272(%r15) +; SZ13-NEXT: br %r14 +entry: + %tan = call <1 x float> 
@llvm.experimental.constrained.tan.v1f32( + <1 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %tan +} + +define <2 x double> @constrained_vector_tan_v2f64() #0 { +; S390X-LABEL: constrained_vector_tan_v2f64: +; S390X: # %bb.0: # %entry +; S390X-NEXT: stmg %r14, %r15, 112(%r15) +; S390X-NEXT: .cfi_offset %r14, -48 +; S390X-NEXT: .cfi_offset %r15, -40 +; S390X-NEXT: aghi %r15, -168 +; S390X-NEXT: .cfi_def_cfa_offset 328 +; S390X-NEXT: std %f8, 160(%r15) # 8-byte Folded Spill +; S390X-NEXT: .cfi_offset %f8, -168 +; S390X-NEXT: larl %r1, .LCPI120_0 +; S390X-NEXT: ld %f0, 0(%r1) +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: larl %r1, .LCPI120_1 +; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldr %f8, %f0 +; S390X-NEXT: ldr %f0, %f1 +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: ldr %f2, %f8 +; S390X-NEXT: ld %f8, 160(%r15) # 8-byte Folded Reload +; S390X-NEXT: lmg %r14, %r15, 280(%r15) +; S390X-NEXT: br %r14 +; +; SZ13-LABEL: constrained_vector_tan_v2f64: +; SZ13: # %bb.0: # %entry +; SZ13-NEXT: stmg %r14, %r15, 112(%r15) +; SZ13-NEXT: .cfi_offset %r14, -48 +; SZ13-NEXT: .cfi_offset %r15, -40 +; SZ13-NEXT: aghi %r15, -176 +; SZ13-NEXT: .cfi_def_cfa_offset 336 +; SZ13-NEXT: larl %r1, .LCPI120_0 +; SZ13-NEXT: ld %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: larl %r1, .LCPI120_1 +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: ld %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vmrhg %v24, %v0, %v1 +; SZ13-NEXT: lmg %r14, %r15, 288(%r15) +; SZ13-NEXT: br %r14 +entry: + %tan = call <2 x double> @llvm.experimental.constrained.tan.v2f64( + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %tan +} + +define <3 x float> @constrained_vector_tan_v3f32() #0 { +; S390X-LABEL: 
constrained_vector_tan_v3f32: +; S390X: # %bb.0: # %entry +; S390X-NEXT: stmg %r14, %r15, 112(%r15) +; S390X-NEXT: .cfi_offset %r14, -48 +; S390X-NEXT: .cfi_offset %r15, -40 +; S390X-NEXT: aghi %r15, -176 +; S390X-NEXT: .cfi_def_cfa_offset 336 +; S390X-NEXT: std %f8, 168(%r15) # 8-byte Folded Spill +; S390X-NEXT: std %f9, 160(%r15) # 8-byte Folded Spill +; S390X-NEXT: .cfi_offset %f8, -168 +; S390X-NEXT: .cfi_offset %f9, -176 +; S390X-NEXT: larl %r1, .LCPI121_0 +; S390X-NEXT: le %f0, 0(%r1) +; S390X-NEXT: brasl %r14, tanf@PLT +; S390X-NEXT: larl %r1, .LCPI121_1 +; S390X-NEXT: le %f1, 0(%r1) +; S390X-NEXT: ler %f8, %f0 +; S390X-NEXT: ler %f0, %f1 +; S390X-NEXT: brasl %r14, tanf@PLT +; S390X-NEXT: larl %r1, .LCPI121_2 +; S390X-NEXT: le %f1, 0(%r1) +; S390X-NEXT: ler %f9, %f0 +; S390X-NEXT: ler %f0, %f1 +; S390X-NEXT: brasl %r14, tanf@PLT +; S390X-NEXT: ler %f2, %f9 +; S390X-NEXT: ler %f4, %f8 +; S390X-NEXT: ld %f8, 168(%r15) # 8-byte Folded Reload +; S390X-NEXT: ld %f9, 160(%r15) # 8-byte Folded Reload +; S390X-NEXT: lmg %r14, %r15, 288(%r15) +; S390X-NEXT: br %r14 +; +; SZ13-LABEL: constrained_vector_tan_v3f32: +; SZ13: # %bb.0: # %entry +; SZ13-NEXT: stmg %r14, %r15, 112(%r15) +; SZ13-NEXT: .cfi_offset %r14, -48 +; SZ13-NEXT: .cfi_offset %r15, -40 +; SZ13-NEXT: aghi %r15, -192 +; SZ13-NEXT: .cfi_def_cfa_offset 352 +; SZ13-NEXT: larl %r1, .LCPI121_0 +; SZ13-NEXT: lde %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tanf@PLT +; SZ13-NEXT: larl %r1, .LCPI121_1 +; SZ13-NEXT: # kill: def $f0s killed $f0s def $v0 +; SZ13-NEXT: vst %v0, 176(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: lde %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tanf@PLT +; SZ13-NEXT: larl %r1, .LCPI121_2 +; SZ13-NEXT: # kill: def $f0s killed $f0s def $v0 +; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: lde %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tanf@PLT +; SZ13-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: # kill: def $f0s killed $f0s def $v0 +; SZ13-NEXT: vmrhf %v0, %v1, %v0 
+; SZ13-NEXT: vl %v1, 176(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: vrepf %v1, %v1, 0 +; SZ13-NEXT: vmrhg %v24, %v0, %v1 +; SZ13-NEXT: lmg %r14, %r15, 304(%r15) +; SZ13-NEXT: br %r14 +entry: + %tan = call <3 x float> @llvm.experimental.constrained.tan.v3f32( + <3 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %tan +} + +define void @constrained_vector_tan_v3f64(ptr %a) #0 { +; S390X-LABEL: constrained_vector_tan_v3f64: +; S390X: # %bb.0: # %entry +; S390X-NEXT: stmg %r13, %r15, 104(%r15) +; S390X-NEXT: .cfi_offset %r13, -56 +; S390X-NEXT: .cfi_offset %r14, -48 +; S390X-NEXT: .cfi_offset %r15, -40 +; S390X-NEXT: aghi %r15, -184 +; S390X-NEXT: .cfi_def_cfa_offset 344 +; S390X-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill +; S390X-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill +; S390X-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill +; S390X-NEXT: .cfi_offset %f8, -168 +; S390X-NEXT: .cfi_offset %f9, -176 +; S390X-NEXT: .cfi_offset %f10, -184 +; S390X-NEXT: lgr %r13, %r2 +; S390X-NEXT: ld %f8, 0(%r2) +; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f9, 8(%r2) +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: ldr %f10, %f0 +; S390X-NEXT: ldr %f0, %f9 +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: ldr %f9, %f0 +; S390X-NEXT: ldr %f0, %f8 +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f9, 8(%r13) +; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload +; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload +; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload +; S390X-NEXT: lmg %r13, %r15, 288(%r15) +; S390X-NEXT: br %r14 +; +; SZ13-LABEL: constrained_vector_tan_v3f64: +; SZ13: # %bb.0: # %entry +; SZ13-NEXT: stmg %r13, %r15, 104(%r15) +; SZ13-NEXT: .cfi_offset %r13, -56 +; SZ13-NEXT: .cfi_offset %r14, -48 +; SZ13-NEXT: .cfi_offset %r15, -40 +; SZ13-NEXT: aghi %r15, -200 +; SZ13-NEXT: .cfi_def_cfa_offset 360 +; SZ13-NEXT: std %f8, 192(%r15) 
# 8-byte Folded Spill +; SZ13-NEXT: .cfi_offset %f8, -168 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: ld %f8, 16(%r2) +; SZ13-NEXT: lgr %r13, %r2 +; SZ13-NEXT: vst %v0, 176(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: vl %v0, 176(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: vrepg %v0, %v0, 1 +; SZ13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vmrhg %v0, %v1, %v0 +; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: ldr %f0, %f8 +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: std %f0, 16(%r13) +; SZ13-NEXT: vl %v0, 160(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: ld %f8, 192(%r15) # 8-byte Folded Reload +; SZ13-NEXT: vst %v0, 0(%r13), 4 +; SZ13-NEXT: lmg %r13, %r15, 304(%r15) +; SZ13-NEXT: br %r14 +entry: + %b = load <3 x double>, ptr %a + %tan = call <3 x double> @llvm.experimental.constrained.tan.v3f64( + <3 x double> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + store <3 x double> %tan, ptr %a + ret void +} + +define <4 x double> @constrained_vector_tan_v4f64() #0 { +; S390X-LABEL: constrained_vector_tan_v4f64: +; S390X: # %bb.0: # %entry +; S390X-NEXT: stmg %r14, %r15, 112(%r15) +; S390X-NEXT: .cfi_offset %r14, -48 +; S390X-NEXT: .cfi_offset %r15, -40 +; S390X-NEXT: aghi %r15, -184 +; S390X-NEXT: .cfi_def_cfa_offset 344 +; S390X-NEXT: std %f8, 176(%r15) # 8-byte Folded Spill +; S390X-NEXT: std %f9, 168(%r15) # 8-byte Folded Spill +; S390X-NEXT: std %f10, 160(%r15) # 8-byte Folded Spill +; S390X-NEXT: .cfi_offset %f8, -168 +; S390X-NEXT: .cfi_offset %f9, -176 +; S390X-NEXT: .cfi_offset %f10, -184 +; S390X-NEXT: larl %r1, .LCPI123_0 +; S390X-NEXT: ld %f0, 0(%r1) +; 
S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: larl %r1, .LCPI123_1 +; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldr %f8, %f0 +; S390X-NEXT: ldr %f0, %f1 +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: larl %r1, .LCPI123_2 +; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldr %f9, %f0 +; S390X-NEXT: ldr %f0, %f1 +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: larl %r1, .LCPI123_3 +; S390X-NEXT: ld %f1, 0(%r1) +; S390X-NEXT: ldr %f10, %f0 +; S390X-NEXT: ldr %f0, %f1 +; S390X-NEXT: brasl %r14, tan@PLT +; S390X-NEXT: ldr %f2, %f10 +; S390X-NEXT: ldr %f4, %f9 +; S390X-NEXT: ldr %f6, %f8 +; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload +; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload +; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload +; S390X-NEXT: lmg %r14, %r15, 296(%r15) +; S390X-NEXT: br %r14 +; +; SZ13-LABEL: constrained_vector_tan_v4f64: +; SZ13: # %bb.0: # %entry +; SZ13-NEXT: stmg %r14, %r15, 112(%r15) +; SZ13-NEXT: .cfi_offset %r14, -48 +; SZ13-NEXT: .cfi_offset %r15, -40 +; SZ13-NEXT: aghi %r15, -192 +; SZ13-NEXT: .cfi_def_cfa_offset 352 +; SZ13-NEXT: larl %r1, .LCPI123_0 +; SZ13-NEXT: ld %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: larl %r1, .LCPI123_1 +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: ld %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: vl %v1, 160(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vmrhg %v0, %v0, %v1 +; SZ13-NEXT: larl %r1, .LCPI123_2 +; SZ13-NEXT: vst %v0, 160(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: ld %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: larl %r1, .LCPI123_3 +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vst %v0, 176(%r15), 3 # 16-byte Folded Spill +; SZ13-NEXT: ld %f0, 0(%r1) +; SZ13-NEXT: brasl %r14, tan@PLT +; SZ13-NEXT: vl %v1, 176(%r15), 3 # 16-byte Folded Reload +; SZ13-NEXT: vl %v24, 160(%r15), 3 # 16-byte 
Folded Reload +; SZ13-NEXT: # kill: def $f0d killed $f0d def $v0 +; SZ13-NEXT: vmrhg %v26, %v0, %v1 +; SZ13-NEXT: lmg %r14, %r15, 304(%r15) +; SZ13-NEXT: br %r14 +entry: + %tan = call <4 x double> @llvm.experimental.constrained.tan.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %tan +} + attributes #0 = { strictfp } declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) @@ -6234,6 +6551,7 @@ declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata) @@ -6260,6 +6578,7 @@ declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x f declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata) declare <1 x float> 
@llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata) @@ -6296,6 +6615,8 @@ declare <3 x float> @llvm.experimental.constrained.sin.v3f32(<3 x float>, metada declare <3 x double> @llvm.experimental.constrained.sin.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata) @@ -6335,6 +6656,7 @@ declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll index d2b45ee1e03e63..8c48e6f9da80a7 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -2758,6 +2758,58 @@ entry: ret float %result } 
+; Verify that tan(42.0) isn't simplified when the rounding mode is unknown. +define double @ftan() #0 { +; X87-LABEL: ftan: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 16 +; X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X87-NEXT: fstpl (%esp) +; X87-NEXT: wait +; X87-NEXT: calll tan +; X87-NEXT: addl $12, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: ftan: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 16 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: calll tan +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: ftan: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; SSE-NEXT: callq tan@PLT +; SSE-NEXT: popq %rax +; SSE-NEXT: .cfi_def_cfa_offset 8 +; SSE-NEXT: retq +; +; AVX-LABEL: ftan: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.tan.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + + attributes #0 = { strictfp } @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" @@ -2771,6 +2823,7 @@ declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) 
declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll index 1bc308bef8cccf..cfec52c0e68863 100644 --- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll +++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll @@ -160,6 +160,23 @@ define float @sin(float %x) #0 { ret float %result } +define float @tan(float %x) #0 { +; CHECK-LABEL: tan: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _tan +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: addl $12, %esp +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.tan.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + attributes #0 = { strictfp } declare float @llvm.experimental.constrained.ceil.f32(float, metadata) @@ -171,3 +188,4 @@ declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index f1d473f81a9fa1..bd51f553587db7 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1047,6 +1047,46 @@ entry: ret fp128 %sqrt } +define fp128 @tan(fp128 %x) nounwind strictfp { 
+; ANDROID-LABEL: tan: +; ANDROID: # %bb.0: # %entry +; ANDROID-NEXT: pushq %rax +; ANDROID-NEXT: callq tanl@PLT +; ANDROID-NEXT: popq %rax +; ANDROID-NEXT: retq +; +; GNU-LABEL: tan: +; GNU: # %bb.0: # %entry +; GNU-NEXT: pushq %rax +; GNU-NEXT: callq tanf128@PLT +; GNU-NEXT: popq %rax +; GNU-NEXT: retq +; +; X86-LABEL: tan: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: subl $24, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll tanl +; X86-NEXT: addl $28, %esp +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $24, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl $4 +entry: + %tan = call fp128 @llvm.experimental.constrained.tan.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret fp128 %tan +} + define fp128 @trunc(fp128 %x) nounwind strictfp { ; ANDROID-LABEL: trunc: ; ANDROID: # %bb.0: # %entry @@ -1663,6 +1703,7 @@ declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata) declare fp128 @llvm.experimental.constrained.roundeven.f128(fp128, metadata) declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata) declare i32 @llvm.experimental.constrained.lrint.i32.f128(fp128, metadata, metadata) declare i64 @llvm.experimental.constrained.llrint.i64.f128(fp128, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll index 4d50b15e5c185b..89729975cfd61b 100644 --- 
a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll @@ -504,6 +504,31 @@ entry: ret x86_fp80 %sin } +define x86_fp80 @tan(x86_fp80 %x) nounwind strictfp { +; X86-LABEL: tan: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll tanl +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +; +; X64-LABEL: tan: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $24, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq tanl@PLT +; X64-NEXT: addq $24, %rsp +; X64-NEXT: retq +entry: + %tan = call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %tan +} + define x86_fp80 @trunc(x86_fp80 %x) nounwind strictfp { ; X86-LABEL: trunc: ; X86: # %bb.0: # %entry @@ -650,6 +675,7 @@ declare x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80, metadata, met declare x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80, metadata) declare x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80, metadata) declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) declare i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80, metadata, metadata) declare i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 0adb9ddfc426a8..d71fd470651cf9 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -7771,6 +7771,234 @@ define <16 x float> @vpaddd_mask_test(<16 x float> %i, <16 x 
float> %j, <16 x i3 %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %i ret <16 x float> %r } + +define <1 x float> @constrained_vector_tan_v1f32() #0 { +; CHECK-LABEL: constrained_vector_tan_v1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_tan_v1f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq tanf@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %tan = call <1 x float> @llvm.experimental.constrained.tan.v1f32( + <1 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %tan +} + +define <2 x double> @constrained_vector_tan_v2f64() #0 { +; CHECK-LABEL: constrained_vector_tan_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_tan_v2f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte 
Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %tan = call <2 x double> @llvm.experimental.constrained.tan.v2f64( + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %tan +} + +define <3 x float> @constrained_vector_tan_v3f32() #0 { +; CHECK-LABEL: constrained_vector_tan_v3f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_tan_v3f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq tanf@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq tanf@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq tanf@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vinsertps $32, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %tan = call <3 x float> @llvm.experimental.constrained.tan.v3f32( + <3 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %tan +} + +define <3 x double> @constrained_vector_tan_v3f64() #0 { +; CHECK-LABEL: constrained_vector_tan_v3f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) +; CHECK-NEXT: wait +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_tan_v3f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: 
vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %tan = call <3 x double> @llvm.experimental.constrained.tan.v3f64( + <3 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %tan +} + +define <4 x double> @constrained_vector_tan_v4f64() #0 { +; CHECK-LABEL: constrained_vector_tan_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_tan_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovaps 
%xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: callq tan@PLT +; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %tan = call <4 x double> @llvm.experimental.constrained.tan.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <4 x double> %tan +} + + + declare <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata) attributes #0 = { strictfp } @@ -7786,6 +8014,7 @@ declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata) @@ -7829,6 +8058,7 @@ declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x f declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata) declare <1 x float> 
@llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata) @@ -7882,6 +8112,8 @@ declare <3 x float> @llvm.experimental.constrained.sin.v3f32(<3 x float>, metada declare <3 x double> @llvm.experimental.constrained.sin.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata) @@ -7938,6 +8170,7 @@ declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, 
metadata) declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata) diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll index b92408a1bf1cd5..78275a16d3e8f7 100644 --- a/llvm/test/Feature/fp-intrinsics.ll +++ b/llvm/test/Feature/fp-intrinsics.ll @@ -151,6 +151,17 @@ entry: ret double %result } +; Verify that tan(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: ftan +; CHECK: call double @llvm.experimental.constrained.tan +define double @ftan() #0 { +entry: + %result = call double @llvm.experimental.constrained.tan.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. ; CHECK-LABEL: f10 ; CHECK: call double @llvm.experimental.constrained.exp @@ -407,6 +418,7 @@ declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) From c4f8ae6f32dadf9383c94ed13152d91f68631255 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov <6532716+alexander-shaposhnikov@users.noreply.github.com> Date: Mon, 10 Jun 2024 17:53:22 -0700 Subject: [PATCH 07/82] [LLVM][IR][Sanitizers] Add sanitize_numerical_stability attribute (#95051) Add sanitize_numerical_stability attribute. 
--- llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Attributes.td | 4 ++++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 ++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 ++ llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1 + llvm/test/Bitcode/compatibility.ll | 7 +++++-- 6 files changed, 15 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 9999aee61528e5..39dcd209afdc69 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -754,6 +754,7 @@ enum AttributeKindCodes { ATTR_KIND_CORO_ONLY_DESTROY_WHEN_COMPLETE = 90, ATTR_KIND_DEAD_ON_UNWIND = 91, ATTR_KIND_RANGE = 92, + ATTR_KIND_SANITIZE_NUMERICAL_STABILITY = 93, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index cef8b17769f0d0..772c7579aec6d8 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -285,6 +285,9 @@ def SanitizeHWAddress : EnumAttr<"sanitize_hwaddress", [FnAttr]>; /// MemTagSanitizer is on. def SanitizeMemTag : EnumAttr<"sanitize_memtag", [FnAttr]>; +/// NumericalStabilitySanitizer is on. +def SanitizeNumericalStability : EnumAttr<"sanitize_numerical_stability", [FnAttr]>; + /// Speculative Load Hardening is enabled. 
/// /// Note that this uses the default compatibility (always compatible during @@ -372,6 +375,7 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; +def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 026595bdc63405..40852a6fd404b5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2128,6 +2128,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::SanitizeThread; case bitc::ATTR_KIND_SANITIZE_MEMORY: return Attribute::SanitizeMemory; + case bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY: + return Attribute::SanitizeNumericalStability; case bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING: return Attribute::SpeculativeLoadHardening; case bitc::ATTR_KIND_SWIFT_ERROR: diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 35ea3c11396e7e..b08d5c50e5ae3e 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -828,6 +828,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_SANITIZE_THREAD; case Attribute::SanitizeMemory: return bitc::ATTR_KIND_SANITIZE_MEMORY; + case Attribute::SanitizeNumericalStability: + return bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY; case Attribute::SpeculativeLoadHardening: return bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING; case Attribute::SwiftError: diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index f2672b8e9118f5..b2775eb6c6c7a4 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -954,6 +954,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case 
Attribute::ShadowCallStack: case Attribute::SanitizeAddress: case Attribute::SanitizeMemory: + case Attribute::SanitizeNumericalStability: case Attribute::SanitizeThread: case Attribute::SanitizeHWAddress: case Attribute::SanitizeMemTag: diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index 2a846e036924c7..e437c37d8d1c87 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1564,7 +1564,7 @@ exit: ; CHECK: select <2 x i1> , <2 x i8> , <2 x i8> call void @f.nobuiltin() builtin - ; CHECK: call void @f.nobuiltin() #51 + ; CHECK: call void @f.nobuiltin() #52 call fastcc noalias ptr @f.noalias() noinline ; CHECK: call fastcc noalias ptr @f.noalias() #12 @@ -1988,6 +1988,8 @@ declare void @f.nosanitize_bounds() nosanitize_bounds declare void @f.allockind() allockind("alloc,uninitialized") ; CHECK: declare void @f.allockind() #50 +declare void @f.sanitize_numerical_stability() sanitize_numerical_stability +; CHECK: declare void @f.sanitize_numerical_stability() #51 ; CHECK: declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) @@ -2110,7 +2112,8 @@ define float @nofpclass_callsites(float %arg) { ; CHECK: attributes #48 = { allocsize(1,0) } ; CHECK: attributes #49 = { nosanitize_bounds } ; CHECK: attributes #50 = { allockind("alloc,uninitialized") } -; CHECK: attributes #51 = { builtin } +; CHECK: attributes #51 = { sanitize_numerical_stability } +; CHECK: attributes #52 = { builtin } ;; Metadata From abbb24b0f3e33387461eba717de2c7296b0a19a6 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 10 Jun 2024 17:54:51 -0700 Subject: [PATCH 08/82] MCSection: Remove unused reverse iterators --- llvm/include/llvm/MC/MCSection.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h index 90effde5bb670a..aa648e97e22386 100644 --- 
a/llvm/include/llvm/MC/MCSection.h +++ b/llvm/include/llvm/MC/MCSection.h @@ -63,9 +63,6 @@ class MCSection { using const_iterator = FragmentListType::const_iterator; using iterator = FragmentListType::iterator; - using const_reverse_iterator = FragmentListType::const_reverse_iterator; - using reverse_iterator = FragmentListType::reverse_iterator; - private: MCSymbol *Begin; MCSymbol *End = nullptr; From 5275aed4d0195e5db214e6638d29b0b702d16b3c Mon Sep 17 00:00:00 2001 From: Freddy Ye Date: Tue, 11 Jun 2024 09:08:51 +0800 Subject: [PATCH 09/82] Reland "[X86] Assign AVX10_1 feature priority to align with gcc. (#94557)" (#94734) This reverts commit c007883f0286a314eb69976ad14da2bce988fb55. --- llvm/include/llvm/TargetParser/X86TargetParser.def | 4 ++-- llvm/lib/TargetParser/X86TargetParser.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 8daa8a689c95f5..f10cede0734e44 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -250,8 +250,8 @@ X86_FEATURE_COMPAT(SHA512, "sha512", 0) X86_FEATURE_COMPAT(SM4, "sm4", 0) X86_FEATURE (EGPR, "egpr") X86_FEATURE_COMPAT(USERMSR, "usermsr", 0) -X86_FEATURE_COMPAT(AVX10_1, "avx10.1-256", 0) -X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 0) +X86_FEATURE_COMPAT(AVX10_1, "avx10.1-256", 36) +X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 37) // These features aren't really CPU features, but the frontend can set them. 
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index e3802380d2beea..eda0c7f5da15ab 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -748,13 +748,13 @@ unsigned llvm::X86::getFeaturePriority(ProcessorFeatures Feat) { #ifndef NDEBUG // Check that priorities are set properly in the .def file. We expect that // "compat" features are assigned non-duplicate consecutive priorities - // starting from one (1, ..., 35) and multiple zeros. + // starting from one (1, ..., 37) and multiple zeros. #define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) PRIORITY, unsigned Priorities[] = { #include "llvm/TargetParser/X86TargetParser.def" }; std::array HelperList; - const size_t MaxPriority = 35; + const size_t MaxPriority = 37; std::iota(HelperList.begin(), HelperList.begin() + MaxPriority + 1, 0); for (size_t i = MaxPriority + 1; i != std::size(Priorities); ++i) HelperList[i] = 0; From 8e12f31be5a98a66700dd3571e4e12465f05ad61 Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Mon, 10 Jun 2024 20:22:22 -0500 Subject: [PATCH 10/82] [mlir][gpu] Update LaunchFuncOp lowering in GPU to LLVM (#94991) This patch updates the lowering of `LaunchFuncOp` in GPU to LLVM to only legalize the operation with the converted operands, effectively removing the lowering used by the old serialization pipeline. It also removes all remaining uses of the old gpu serialization infrastructure in `gpu-to-llvm`. See [Compilation overview | 'gpu' Dialect - MLIR docs](https://mlir.llvm.org/docs/Dialects/GPU/#compilation-overview) for additional information on the target attributes compilation pipeline that replaced the old serialization pipeline. 
--- .../mlir/Conversion/GPUCommon/GPUCommonPass.h | 10 +- mlir/include/mlir/Conversion/Passes.td | 6 +- .../GPUCommon/GPUToLLVMConversion.cpp | 345 +++--------------- ...ower-launch-func-to-gpu-runtime-calls.mlir | 73 +--- 4 files changed, 66 insertions(+), 368 deletions(-) diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h index 48b7835ae5fca6..2d5e9d27c5bdfc 100644 --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -46,9 +46,6 @@ class LLVMDialect; #define GEN_PASS_DECL_GPUTOLLVMCONVERSIONPASS #include "mlir/Conversion/Passes.h.inc" -using OwnedBlob = std::unique_ptr>; -using BlobGenerator = - std::function; using LoweringCallback = std::function( Operation *, llvm::LLVMContext &, StringRef)>; @@ -66,10 +63,9 @@ struct FunctionCallBuilder { /// Collect a set of patterns to convert from the GPU dialect to LLVM and /// populate converter for gpu types. -void populateGpuToLLVMConversionPatterns( - LLVMTypeConverter &converter, RewritePatternSet &patterns, - StringRef gpuBinaryAnnotation = {}, bool kernelBarePtrCallConv = false, - SymbolTable *cachedModuleTable = nullptr); +void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, + RewritePatternSet &patterns, + bool kernelBarePtrCallConv = false); /// A function that maps a MemorySpace enum to a target-specific integer value. using MemorySpaceMapping = std::function; diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index eb58f4adc31d36..db67d6a5ff1287 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -478,11 +478,7 @@ def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> { /*default=*/"false", "Use bare pointers to pass memref arguments to kernels. " "The kernel must use the same setting for this option." 
- >, - Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string", - /*default=*/"gpu::getDefaultGpuBinaryAnnotation()", - "Annotation attribute string for GPU binary" - > + > ]; let dependentDialects = [ diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index 82bfa9514a8841..92b28ff9c58737 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -49,8 +49,6 @@ namespace mlir { using namespace mlir; -static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; - namespace { class GpuToLLVMConversionPass : public impl::GpuToLLVMConversionPassBase { @@ -97,36 +95,6 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { Type llvmIntPtrType = IntegerType::get( context, this->getTypeConverter()->getPointerBitwidth(0)); - FunctionCallBuilder moduleLoadCallBuilder = { - "mgpuModuleLoad", - llvmPointerType /* void *module */, - {llvmPointerType /* void *cubin */, llvmInt64Type /* size_t size */}}; - FunctionCallBuilder moduleUnloadCallBuilder = { - "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}}; - FunctionCallBuilder moduleGetFunctionCallBuilder = { - "mgpuModuleGetFunction", - llvmPointerType /* void *function */, - { - llvmPointerType, /* void *module */ - llvmPointerType /* char *name */ - }}; - FunctionCallBuilder launchKernelCallBuilder = { - "mgpuLaunchKernel", - llvmVoidType, - { - llvmPointerType, /* void* f */ - llvmIntPtrType, /* intptr_t gridXDim */ - llvmIntPtrType, /* intptr_t gridyDim */ - llvmIntPtrType, /* intptr_t gridZDim */ - llvmIntPtrType, /* intptr_t blockXDim */ - llvmIntPtrType, /* intptr_t blockYDim */ - llvmIntPtrType, /* intptr_t blockZDim */ - llvmInt32Type, /* unsigned int sharedMemBytes */ - llvmPointerType, /* void *hstream */ - llvmPointerType, /* void **kernelParams */ - llvmPointerType, /* void **extra */ - llvmInt64Type /* size_t 
paramsCount */ - }}; FunctionCallBuilder streamCreateCallBuilder = { "mgpuStreamCreate", llvmPointerType /* void *stream */, {}}; FunctionCallBuilder streamDestroyCallBuilder = { @@ -451,55 +419,21 @@ class ConvertWaitAsyncOpToGpuRuntimeCallPattern ConversionPatternRewriter &rewriter) const override; }; -/// A rewrite patter to convert gpu.launch_func operations into a sequence of -/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP). -/// -/// In essence, a gpu.launch_func operations gets compiled into the following -/// sequence of runtime calls: -/// -/// * moduleLoad -- loads the module given the cubin / hsaco data -/// * moduleGetFunction -- gets a handle to the actual kernel function -/// * getStreamHelper -- initializes a new compute stream on GPU -/// * launchKernel -- launches the kernel on a stream -/// * streamSynchronize -- waits for operations on the stream to finish -/// -/// Intermediate data structures are allocated on the stack. -class ConvertLaunchFuncOpToGpuRuntimeCallPattern +/// A rewrite pattern to legalize gpu.launch_func with LLVM types.
+class LegalizeLaunchFuncOpPattern : public ConvertOpToGpuRuntimeCallPattern { public: - ConvertLaunchFuncOpToGpuRuntimeCallPattern( - const LLVMTypeConverter &typeConverter, StringRef gpuBinaryAnnotation, - bool kernelBarePtrCallConv, SymbolTable *cachedModuleTable) + LegalizeLaunchFuncOpPattern(const LLVMTypeConverter &typeConverter, + bool kernelBarePtrCallConv) : ConvertOpToGpuRuntimeCallPattern(typeConverter), - gpuBinaryAnnotation(gpuBinaryAnnotation), - kernelBarePtrCallConv(kernelBarePtrCallConv), - cachedModuleTable(cachedModuleTable) {} + kernelBarePtrCallConv(kernelBarePtrCallConv) {} private: - Value generateParamsArray(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, - OpBuilder &builder) const; - Value generateKernelNameConstant(StringRef moduleName, StringRef name, - Location loc, OpBuilder &builder) const; - LogicalResult matchAndRewrite(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; - llvm::SmallString<32> gpuBinaryAnnotation; bool kernelBarePtrCallConv; - SymbolTable *cachedModuleTable; -}; - -class EraseGpuModuleOpPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(gpu::GPUModuleOp op, - PatternRewriter &rewriter) const override { - // GPU kernel modules are no longer necessary since we have a global - // constant with the CUBIN, or HSACO data. 
- rewriter.eraseOp(op); - return success(); - } }; /// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime @@ -587,7 +521,6 @@ DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SetCsrPointersOp) void GpuToLLVMConversionPass::runOnOperation() { MLIRContext *context = &getContext(); - SymbolTable symbolTable = SymbolTable(getOperation()); LowerToLLVMOptions options(context); options.useBarePtrCallConv = hostBarePtrCallConv; RewritePatternSet patterns(context); @@ -604,30 +537,20 @@ void GpuToLLVMConversionPass::runOnOperation() { iface->populateConvertToLLVMConversionPatterns(target, converter, patterns); } - // Preserve GPU modules if they have target attributes. - target.addDynamicallyLegalOp( - [](gpu::GPUModuleOp module) -> bool { - return module.getTargetsAttr() != nullptr; - }); - // Accept as legal LaunchFuncOps if they refer to GPU Modules with targets and - // the operands have been lowered. + // Preserve GPU modules and binaries. Modules are preserved as they can be + // converted later by `gpu-module-to-binary`. + target.addLegalOp(); + // Accept as legal LaunchFuncOps if the operands have been lowered. target.addDynamicallyLegalOp( - [&](gpu::LaunchFuncOp op) -> bool { - auto module = - symbolTable.lookup(op.getKernelModuleName()); - return converter.isLegal(op->getOperandTypes()) && - converter.isLegal(op->getResultTypes()) && - (module && module.getTargetsAttr() && - !module.getTargetsAttr().empty()); - }); + [&](gpu::LaunchFuncOp op) -> bool { return converter.isLegal(op); }); // These aren't covered by the ConvertToLLVMPatternInterface right now. 
populateVectorToLLVMConversionPatterns(converter, patterns); populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, target); - populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation, - kernelBarePtrCallConv, &symbolTable); + populateGpuToLLVMConversionPatterns(converter, patterns, + kernelBarePtrCallConv); if (failed( applyPartialConversion(getOperation(), target, std::move(patterns)))) @@ -1002,100 +925,8 @@ LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite( return success(); } -// Creates a struct containing all kernel parameters on the stack and returns -// an array of type-erased pointers to the fields of the struct. The array can -// then be passed to the CUDA / ROCm (HIP) kernel launch calls. -// The generated code is essentially as follows: -// -// %struct = alloca(sizeof(struct { Parameters... })) -// %array = alloca(NumParameters * sizeof(void *)) -// for (i : [0, NumParameters)) -// %fieldPtr = llvm.getelementptr %struct[0, i] -// llvm.store parameters[i], %fieldPtr -// %elementPtr = llvm.getelementptr %array[i] -// llvm.store %fieldPtr, %elementPtr -// return %array -Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray( - gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, OpBuilder &builder) const { - auto loc = launchOp.getLoc(); - auto numKernelOperands = launchOp.getNumKernelOperands(); - // Note: If `useBarePtrCallConv` is set in the type converter's options, - // the value of `kernelBarePtrCallConv` will be ignored. 
- SmallVector arguments = getTypeConverter()->promoteOperands( - loc, launchOp.getOperands().take_back(numKernelOperands), - adaptor.getOperands().take_back(numKernelOperands), builder, - /*useBarePtrCallConv=*/kernelBarePtrCallConv); - auto numArguments = arguments.size(); - SmallVector argumentTypes; - argumentTypes.reserve(numArguments); - for (auto argument : arguments) - argumentTypes.push_back(argument.getType()); - auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(), - argumentTypes); - auto one = builder.create(loc, llvmInt32Type, 1); - auto structPtr = - builder.create(loc, llvmPointerType, structType, one, - /*alignment=*/0); - auto arraySize = - builder.create(loc, llvmInt32Type, numArguments); - auto arrayPtr = builder.create( - loc, llvmPointerType, llvmPointerType, arraySize, /*alignment=*/0); - for (const auto &en : llvm::enumerate(arguments)) { - const auto index = static_cast(en.index()); - Value fieldPtr = - builder.create(loc, llvmPointerType, structType, structPtr, - ArrayRef{0, index}); - builder.create(loc, en.value(), fieldPtr); - auto elementPtr = - builder.create(loc, llvmPointerType, llvmPointerType, - arrayPtr, ArrayRef{index}); - builder.create(loc, fieldPtr, elementPtr); - } - return arrayPtr; -} - -// Generates an LLVM IR dialect global that contains the name of the given -// kernel function as a C string, and returns a pointer to its beginning. -// The code is essentially: -// -// llvm.global constant @kernel_name("function_name\00") -// func(...) { -// %0 = llvm.addressof @kernel_name -// %1 = llvm.constant (0 : index) -// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*"> -// } -Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant( - StringRef moduleName, StringRef name, Location loc, - OpBuilder &builder) const { - // Make sure the trailing zero is included in the constant. 
- std::vector kernelName(name.begin(), name.end()); - kernelName.push_back('\0'); - - std::string globalName = - std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name)); - return LLVM::createGlobalString( - loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()), - LLVM::Linkage::Internal); -} - -// Emits LLVM IR to launch a kernel function. Expects the module that contains -// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a -// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. -// -// %0 = call %binarygetter -// %1 = call %moduleLoad(%0) -// %2 = -// %3 = call %moduleGetFunction(%1, %2) -// %4 = call %streamCreate() -// %5 = -// call %launchKernel(%3, , 0, %4, %5, nullptr) -// call %streamSynchronize(%4) -// call %streamDestroy(%4) -// call %moduleUnload(%1) -// -// If the op is async, the stream corresponds to the (single) async dependency -// as well as the async token the op produces. -LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( +// Legalize the op's operands. +LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite( gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter))) @@ -1114,123 +945,37 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( Location loc = launchOp.getLoc(); - // Create an LLVM global with CUBIN extracted from the kernel annotation and - // obtain a pointer to the first byte in it. - gpu::GPUModuleOp kernelModule; - if (cachedModuleTable) - kernelModule = cachedModuleTable->lookup( - launchOp.getKernelModuleName()); - else - kernelModule = SymbolTable::lookupNearestSymbolFrom( - launchOp, launchOp.getKernelModuleName()); - assert(kernelModule && "expected a kernel module"); - - // If the module has Targets then just update the op operands. 
- if (ArrayAttr targets = kernelModule.getTargetsAttr()) { - Value stream = Value(); - if (!adaptor.getAsyncDependencies().empty()) - stream = adaptor.getAsyncDependencies().front(); - // If the async keyword is present and there are no dependencies, then a - // stream must be created to pass to subsequent operations. - else if (launchOp.getAsyncToken()) - stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(); - - // Lower the kernel operands to match kernel parameters. - // Note: If `useBarePtrCallConv` is set in the type converter's options, - // the value of `kernelBarePtrCallConv` will be ignored. - SmallVector arguments = getTypeConverter()->promoteOperands( - loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(), - rewriter, /*useBarePtrCallConv=*/kernelBarePtrCallConv); - - std::optional clusterSize = std::nullopt; - if (launchOp.hasClusterSize()) { - clusterSize = - gpu::KernelDim3{adaptor.getClusterSizeX(), adaptor.getClusterSizeY(), - adaptor.getClusterSizeZ()}; - } - rewriter.create( - launchOp.getLoc(), launchOp.getKernelAttr(), - gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(), - adaptor.getGridSizeZ()}, - gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), - adaptor.getBlockSizeZ()}, - adaptor.getDynamicSharedMemorySize(), arguments, stream, clusterSize); - if (launchOp.getAsyncToken()) - rewriter.replaceOp(launchOp, {stream}); - else - rewriter.eraseOp(launchOp); - return success(); - } + Value stream = Value(); + if (!adaptor.getAsyncDependencies().empty()) + stream = adaptor.getAsyncDependencies().front(); + // If the async keyword is present and there are no dependencies, then a + // stream must be created to pass to subsequent operations. + else if (launchOp.getAsyncToken()) + stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(); + // Lower the kernel operands to match kernel parameters. 
+ // Note: If `useBarePtrCallConv` is set in the type converter's options, + // the value of `kernelBarePtrCallConv` will be ignored. + SmallVector arguments = getTypeConverter()->promoteOperands( + loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(), rewriter, + /*useBarePtrCallConv=*/kernelBarePtrCallConv); - auto binaryAttr = - kernelModule->getAttrOfType(gpuBinaryAnnotation); - if (!binaryAttr) { - kernelModule.emitOpError() - << "missing " << gpuBinaryAnnotation << " attribute"; - return failure(); + std::optional clusterSize = std::nullopt; + if (launchOp.hasClusterSize()) { + clusterSize = + gpu::KernelDim3{adaptor.getClusterSizeX(), adaptor.getClusterSizeY(), + adaptor.getClusterSizeZ()}; } - - SmallString<128> nameBuffer(kernelModule.getName()); - nameBuffer.append(kGpuBinaryStorageSuffix); - Value data = - LLVM::createGlobalString(loc, rewriter, nameBuffer.str(), - binaryAttr.getValue(), LLVM::Linkage::Internal); - - // Pass the binary size. SPIRV requires binary size. - auto gpuBlob = binaryAttr.getValue(); - auto gpuBlobSize = rewriter.create( - loc, llvmInt64Type, - mlir::IntegerAttr::get(llvmInt64Type, - static_cast(gpuBlob.size()))); - - auto module = - moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize}); - - // Pass the count of the parameters to runtime wrappers - auto paramsCount = rewriter.create( - loc, llvmInt64Type, - mlir::IntegerAttr::get( - llvmInt64Type, - static_cast(launchOp.getNumKernelOperands()))); - - // Get the function from the module. The name corresponds to the name of - // the kernel function. - auto kernelName = generateKernelNameConstant( - launchOp.getKernelModuleName().getValue(), - launchOp.getKernelName().getValue(), loc, rewriter); - auto function = moduleGetFunctionCallBuilder.create( - loc, rewriter, {module.getResult(), kernelName}); - Value zero = rewriter.create(loc, llvmInt32Type, 0); - Value stream = - adaptor.getAsyncDependencies().empty() - ? 
streamCreateCallBuilder.create(loc, rewriter, {}).getResult() - : adaptor.getAsyncDependencies().front(); - // Create array of pointers to kernel arguments. - auto kernelParams = generateParamsArray(launchOp, adaptor, rewriter); - auto nullpointer = rewriter.create(loc, llvmPointerType); - Value dynamicSharedMemorySize = launchOp.getDynamicSharedMemorySize() - ? launchOp.getDynamicSharedMemorySize() - : zero; - launchKernelCallBuilder.create( - loc, rewriter, - {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(), - adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), - adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams, - /*extra=*/nullpointer, paramsCount}); - - if (launchOp.getAsyncToken()) { - // Async launch: make dependent ops use the same stream. + rewriter.create( + launchOp.getLoc(), launchOp.getKernelAttr(), + gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(), + adaptor.getGridSizeZ()}, + gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), + adaptor.getBlockSizeZ()}, + adaptor.getDynamicSharedMemorySize(), arguments, stream, clusterSize); + if (launchOp.getAsyncToken()) rewriter.replaceOp(launchOp, {stream}); - } else { - // Synchronize with host and destroy stream. This must be the stream created - // above (with no other uses) because we check that the synchronous version - // does not have any async dependencies. 
- streamSynchronizeCallBuilder.create(loc, rewriter, stream); - streamDestroyCallBuilder.create(loc, rewriter, stream); + else rewriter.eraseOp(launchOp); - } - moduleUnloadCallBuilder.create(loc, rewriter, module.getResult()); - return success(); } @@ -1978,9 +1723,7 @@ LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite( void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, - StringRef gpuBinaryAnnotation, - bool kernelBarePtrCallConv, - SymbolTable *cachedModuleTable) { + bool kernelBarePtrCallConv) { addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); addOpaquePointerConversion(converter); @@ -2017,7 +1760,5 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, ConvertSpGEMMCopyOpToGpuRuntimeCallPattern, ConvertSpMatGetSizeOpToGpuRuntimeCallPattern, ConvertSetCsrPointersOpToGpuRuntimeCallPattern>(converter); - patterns.add( - converter, gpuBinaryAnnotation, kernelBarePtrCallConv, cachedModuleTable); - patterns.add(&converter.getContext()); + patterns.add(converter, kernelBarePtrCallConv); } diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir index c0b05ef0860333..6c5c1e09c0eb5f 100644 --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -1,15 +1,8 @@ -// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" -split-input-file | FileCheck %s -// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" -split-input-file | FileCheck %s --check-prefix=ROCDL +// RUN: mlir-opt %s --gpu-to-llvm -split-input-file | FileCheck %s module attributes {gpu.container_module} { - - // CHECK: llvm.mlir.global internal constant @[[KERNEL_NAME:.*]]("kernel\00") - // CHECK: llvm.mlir.global internal constant 
@[[GLOBAL:.*]]("CUBIN") - // ROCDL: llvm.mlir.global internal constant @[[GLOBAL:.*]]("HSACO") - - gpu.module @kernel_module attributes { - nvvm.cubin = "CUBIN", rocdl.hsaco = "HSACO" - } { + // CHECK: gpu.module + gpu.module @kernel_module [#nvvm.target] { llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, %arg5: i64) attributes {gpu.kernel} { @@ -18,9 +11,17 @@ module attributes {gpu.container_module} { } func.func @foo(%buffer: memref) { + // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64 + // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32 %c8 = arith.constant 8 : index %c32 = arith.constant 32 : i32 %c256 = arith.constant 256 : i32 + + // CHECK: gpu.launch_func @kernel_module::@kernel + // CHECK: blocks in ([[C8]], [[C8]], [[C8]]) threads in ([[C8]], [[C8]], [[C8]]) : i64 + // CHECK: dynamic_shared_memory_size [[C256]] + // CHECK: args([[C32]] : i32, %{{.*}} : !llvm.ptr, %{{.*}} : !llvm.ptr, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64) gpu.launch_func @kernel_module::@kernel blocks in (%c8, %c8, %c8) threads in (%c8, %c8, %c8) @@ -28,46 +29,13 @@ module attributes {gpu.container_module} { args(%c32 : i32, %buffer : memref) return } - - // CHECK-DAG: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32 - // CHECK-DAG: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64 - // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]] - // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0] - // CHECK-SAME: -> !llvm.ptr - // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant - // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]]) - // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant - // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}}) - - // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate - - // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i32) - // CHECK: %[[MEMREF:.*]] = llvm.alloca 
%[[ONE]] x !llvm.struct[[STRUCT_BODY:<.*>]] - // CHECK: [[NUM_PARAMS:%.*]] = llvm.mlir.constant(6 : i32) : i32 - // CHECK-NEXT: [[PARAMS:%.*]] = llvm.alloca [[NUM_PARAMS]] x !llvm.ptr - - // CHECK: llvm.getelementptr %[[MEMREF]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] - // CHECK: llvm.getelementptr %[[MEMREF]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] - // CHECK: llvm.getelementptr %[[MEMREF]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] - // CHECK: llvm.getelementptr %[[MEMREF]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] - // CHECK: llvm.getelementptr %[[MEMREF]][0, 4] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] - // CHECK: llvm.getelementptr %[[MEMREF]][0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] - - // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.zero : !llvm.ptr - - // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]], - // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]], - // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]]) - // CHECK: llvm.call @mgpuStreamSynchronize - // CHECK: llvm.call @mgpuStreamDestroy - // CHECK: llvm.call @mgpuModuleUnload } + // ----- module attributes {gpu.container_module} { // CHECK: gpu.module - // ROCDL: gpu.module gpu.module @kernel_module [#nvvm.target] { llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, @@ -80,15 +48,19 @@ module attributes {gpu.container_module} { // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64 // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32 // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32 - %c8 = arith.constant 8 : index + // CHECK: [[C2:%.*]] = llvm.mlir.constant(2 : index) : i64 + %c8 = arith.constant 8 : index %c32 = arith.constant 32 : i32 %c256 = arith.constant 256 : i32 + %c2 = arith.constant 2 : index // CHECK: gpu.launch_func @kernel_module::@kernel + // CHECK: 
clusters in ([[C2]], [[C2]], [[C2]]) // CHECK: blocks in ([[C8]], [[C8]], [[C8]]) threads in ([[C8]], [[C8]], [[C8]]) : i64 // CHECK: dynamic_shared_memory_size [[C256]] // CHECK: args([[C32]] : i32, %{{.*}} : !llvm.ptr, %{{.*}} : !llvm.ptr, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64) gpu.launch_func @kernel_module::@kernel + clusters in (%c2, %c2, %c2) blocks in (%c8, %c8, %c8) threads in (%c8, %c8, %c8) dynamic_shared_memory_size %c256 @@ -97,18 +69,11 @@ module attributes {gpu.container_module} { } } - // ----- module attributes {gpu.container_module} { - // CHECK: gpu.module - gpu.module @kernel_module [#nvvm.target] { - llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, - %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, - %arg5: i64) attributes {gpu.kernel} { - llvm.return - } - } + // CHECK: gpu.binary + gpu.binary @kernel_module [#gpu.object<#rocdl.target, "blob">] func.func @foo(%buffer: memref) { // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64 From cb63abca27e1813ae58ded466cd81ba3952ab888 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 10 Jun 2024 18:27:34 -0700 Subject: [PATCH 11/82] [MC] Remove getFragmentList uses. 
NFC --- llvm/include/llvm/MC/MCSection.h | 1 + llvm/lib/MC/MCAssembler.cpp | 2 +- llvm/lib/MC/MCObjectStreamer.cpp | 2 +- llvm/lib/MC/MCSection.cpp | 9 ++++++--- llvm/lib/MC/MachObjectWriter.cpp | 8 +++----- llvm/lib/MC/WasmObjectWriter.cpp | 9 ++------- llvm/lib/MC/WinCOFFObjectWriter.cpp | 2 +- llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 2 +- .../Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp | 5 ++--- llvm/tools/dsymutil/MachOUtils.cpp | 2 +- 10 files changed, 19 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h index aa648e97e22386..217b9b4b5bc52b 100644 --- a/llvm/include/llvm/MC/MCSection.h +++ b/llvm/include/llvm/MC/MCSection.h @@ -189,6 +189,7 @@ class MCSection { iterator end() { return Fragments.end(); } const_iterator end() const { return Fragments.end(); } + bool empty() const { return Fragments.empty(); } void addFragment(MCFragment &F) { Fragments.push_back(&F); } diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index a7157e7a37b439..8490853eda87c2 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -820,7 +820,7 @@ void MCAssembler::layout(MCAsmLayout &Layout) { for (MCSection &Sec : *this) { // Create dummy fragments to eliminate any empty sections, this simplifies // layout. 
- if (Sec.getFragmentList().empty()) + if (Sec.empty()) new MCDataFragment(&Sec); Sec.setOrdinal(SectionIndex++); diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 8d3873bed9efdd..ae4e6915fa294c 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -202,7 +202,7 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) { MCFragment *MCObjectStreamer::getCurrentFragment() const { assert(getCurrentSectionOnly() && "No current section!"); - if (CurInsertionPoint != getCurrentSectionOnly()->getFragmentList().begin()) + if (CurInsertionPoint != getCurrentSectionOnly()->begin()) return &*std::prev(CurInsertionPoint); return nullptr; diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp index 12e69f70537b78..9848d7fafe764a 100644 --- a/llvm/lib/MC/MCSection.cpp +++ b/llvm/lib/MC/MCSection.cpp @@ -130,10 +130,13 @@ LLVM_DUMP_METHOD void MCSection::dump() const { OS << "dump(); + F.dump(); } OS << "]>"; } diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index d17e6e125d8727..de8bde4211b49b 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -767,11 +767,9 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, if (!Asm.CGProfile.empty()) { MCSection *CGProfileSection = Asm.getContext().getMachOSection( "__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); - MCDataFragment *Frag = dyn_cast_or_null( - &*CGProfileSection->getFragmentList().begin()); - assert(Frag && "call graph profile section not reserved"); - Frag->getContents().clear(); - raw_svector_ostream OS(Frag->getContents()); + auto &Frag = cast(*CGProfileSection->begin()); + Frag.getContents().clear(); + raw_svector_ostream OS(Frag.getContents()); for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) { uint32_t FromIndex = CGPE.From->getSymbol().getIndex(); uint32_t ToIndex = CGPE.To->getSymbol().getIndex(); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp 
b/llvm/lib/MC/WasmObjectWriter.cpp index 788e92f94b2689..451269608f1799 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -1857,14 +1857,9 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, report_fatal_error(".fini_array sections are unsupported"); if (!WS.getName().starts_with(".init_array")) continue; - if (WS.getFragmentList().empty()) - continue; - - // init_array is expected to contain a single non-empty data fragment - if (WS.getFragmentList().size() != 3) - report_fatal_error("only one .init_array section fragment supported"); - auto IT = WS.begin(); + if (IT == WS.end()) + continue; const MCFragment &EmptyFrag = *IT; if (EmptyFrag.getKind() != MCFragment::FT_Data) report_fatal_error(".init_array section should be aligned"); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index 3c9ff71b6b0622..a2b6c4e5c3a5c5 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -354,7 +354,7 @@ void WinCOFFWriter::defineSection(const MCSectionCOFF &MCSec, Section->MCSection = &MCSec; SectionMap[&MCSec] = Section; - if (UseOffsetLabels && !MCSec.getFragmentList().empty()) { + if (UseOffsetLabels && !MCSec.empty()) { const uint32_t Interval = 1 << OffsetLabelIntervalBits; uint32_t N = 1; for (uint32_t Off = Interval, E = Layout.getSectionAddressSize(&MCSec); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index de7449a400a741..b6cecccf3572dd 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -329,7 +329,7 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx, MCSectionELF *TextSection = static_cast(Ctx.getObjectFileInfo()->getTextSection()); if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions()) { - for (auto &F : TextSection->getFragmentList()) + for (auto &F : 
*TextSection) if (auto *DF = dyn_cast(&F)) if (!DF->getContents().empty()) return; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index f9a0ba3608e6dc..3b6ea81cdf10ed 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -713,15 +713,14 @@ class HexagonAsmBackend : public MCAsmBackend { void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override { for (auto *I : Layout.getSectionOrder()) { - auto &Fragments = I->getFragmentList(); - for (auto &J : Fragments) { + for (auto &J : *I) { switch (J.getKind()) { default: break; case MCFragment::FT_Align: { auto Size = Asm.computeFragmentSize(Layout, J); for (auto K = J.getIterator(); - K != Fragments.begin() && Size >= HEXAGON_PACKET_SIZE;) { + K != I->begin() && Size >= HEXAGON_PACKET_SIZE;) { --K; switch (K->getKind()) { default: diff --git a/llvm/tools/dsymutil/MachOUtils.cpp b/llvm/tools/dsymutil/MachOUtils.cpp index 8e144d640ed01f..b52ab1ce6d2949 100644 --- a/llvm/tools/dsymutil/MachOUtils.cpp +++ b/llvm/tools/dsymutil/MachOUtils.cpp @@ -630,7 +630,7 @@ bool generateDsymCompanion( // Emit the Dwarf sections contents. for (const MCSection &Sec : MCAsm) { - if (Sec.begin() == Sec.end()) + if (Sec.empty()) continue; uint64_t Pos = OutFile.tell(); From 69e9e779b783bb34a3c1f73c93ca63ee6b89ab09 Mon Sep 17 00:00:00 2001 From: Pavel Samolysov Date: Tue, 11 Jun 2024 05:30:50 +0300 Subject: [PATCH 12/82] [clang] Replace X && isa(X) with isa_and_nonnull(X). NFC (#94987) This addresses a clang-tidy suggestion. 
--- .../clang/Analysis/Analyses/ThreadSafetyCommon.h | 4 ++-- clang/include/clang/Lex/Preprocessor.h | 2 +- clang/include/clang/Sema/SemaObjC.h | 2 +- .../StaticAnalyzer/Core/PathSensitive/MemRegion.h | 2 +- clang/lib/ARCMigrate/TransUnbridgedCasts.cpp | 2 +- clang/lib/AST/ASTImporter.cpp | 2 +- clang/lib/AST/DeclBase.cpp | 2 +- clang/lib/AST/Expr.cpp | 4 ++-- clang/lib/AST/ExprConstant.cpp | 2 +- clang/lib/AST/Mangle.cpp | 2 +- clang/lib/AST/MicrosoftMangle.cpp | 2 +- clang/lib/AST/ParentMap.cpp | 7 +++++-- clang/lib/AST/StmtPrinter.cpp | 4 ++-- clang/lib/CodeGen/CGBlocks.cpp | 2 +- clang/lib/CodeGen/CGClass.cpp | 4 ++-- clang/lib/CodeGen/CGExprConstant.cpp | 2 +- clang/lib/CodeGen/CGStmtOpenMP.cpp | 6 +++--- clang/lib/CodeGen/CodeGenFunction.cpp | 2 +- clang/lib/Index/IndexBody.cpp | 2 +- clang/lib/Lex/PPMacroExpansion.cpp | 2 +- clang/lib/Sema/AnalysisBasedWarnings.cpp | 8 ++++---- clang/lib/Sema/SemaCXXScopeSpec.cpp | 2 +- clang/lib/Sema/SemaChecking.cpp | 10 +++++----- clang/lib/Sema/SemaExprCXX.cpp | 8 ++++---- clang/lib/Sema/SemaInit.cpp | 6 +++--- clang/lib/Sema/SemaStmt.cpp | 2 +- clang/lib/Sema/SemaTemplate.cpp | 2 +- 27 files changed, 49 insertions(+), 46 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h index 7bdb9052e57e74..e99c5b2466334a 100644 --- a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h +++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h @@ -330,9 +330,9 @@ class CapabilityExpr { bool shouldIgnore() const { return sexpr() == nullptr; } - bool isInvalid() const { return sexpr() && isa(sexpr()); } + bool isInvalid() const { return isa_and_nonnull(sexpr()); } - bool isUniversal() const { return sexpr() && isa(sexpr()); } + bool isUniversal() const { return isa_and_nonnull(sexpr()); } }; // Translate clang::Expr to til::SExpr. 
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index c0850a8fa9f7f8..9b1628d2d86f9e 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -1360,7 +1360,7 @@ class Preprocessor { MacroState &S = CurSubmoduleState->Macros[II]; auto *MD = S.getLatest(); - while (MD && isa(MD)) + while (isa_and_nonnull(MD)) MD = MD->getPrevious(); return MacroDefinition(dyn_cast_or_null(MD), S.getActiveModuleMacros(*this, II), diff --git a/clang/include/clang/Sema/SemaObjC.h b/clang/include/clang/Sema/SemaObjC.h index 91430797e5ed82..bb8887691ce5d3 100644 --- a/clang/include/clang/Sema/SemaObjC.h +++ b/clang/include/clang/Sema/SemaObjC.h @@ -383,7 +383,7 @@ class SemaObjC : public SemaBase { void AddAnyMethodToGlobalPool(Decl *D); void ActOnStartOfObjCMethodDef(Scope *S, Decl *D); - bool isObjCMethodDecl(Decl *D) { return D && isa(D); } + bool isObjCMethodDecl(Decl *D) { return isa_and_nonnull(D); } /// CheckImplementationIvars - This routine checks if the instance variables /// listed in the implelementation match those listed in the interface. diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h index 151d3e57c1cb81..59805d01be5db7 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h @@ -781,7 +781,7 @@ class SymbolicRegion : public SubRegion { : SubRegion(sreg, SymbolicRegionKind), sym(s) { // Because pointer arithmetic is represented by ElementRegion layers, // the base symbol here should not contain any arithmetic. 
- assert(s && isa(s)); + assert(isa_and_nonnull(s)); assert(s->getType()->isAnyPointerType() || s->getType()->isReferenceType() || s->getType()->isBlockPointerType()); diff --git a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp index 1e6354f71e294a..7390ea17c8a4b6 100644 --- a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp +++ b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp @@ -371,7 +371,7 @@ class UnbridgedCastRewriter : public RecursiveASTVisitor{ Stmt *parent = E; do { parent = StmtMap->getParentIgnoreParenImpCasts(parent); - } while (parent && isa(parent)); + } while (isa_and_nonnull(parent)); if (ReturnStmt *retS = dyn_cast_or_null(parent)) { std::string note = "remove the cast and change return type of function " diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 3b9080e09b3313..02cd4ed9a6cace 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1505,7 +1505,7 @@ ExpectedType ASTNodeImporter::VisitInjectedClassNameType( // The InjectedClassNameType is created in VisitRecordDecl when the // T->getDecl() is imported. Here we can return the existing type. 
const Type *Ty = (*ToDeclOrErr)->getTypeForDecl(); - assert(Ty && isa(Ty)); + assert(isa_and_nonnull(Ty)); return QualType(Ty, 0); } diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 7f1ed9c691e988..e64a8326e8d5dd 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1116,7 +1116,7 @@ bool Decl::isInExportDeclContext() const { while (DC && !isa(DC)) DC = DC->getLexicalParent(); - return DC && isa(DC); + return isa_and_nonnull(DC); } bool Decl::isInAnotherModuleUnit() const { diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index f9d634550dc061..7e555689b64c48 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -837,7 +837,7 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, typedef SmallVector SpecsTy; SpecsTy Specs; const DeclContext *Ctx = FD->getDeclContext(); - while (Ctx && isa(Ctx)) { + while (isa_and_nonnull(Ctx)) { const ClassTemplateSpecializationDecl *Spec = dyn_cast(Ctx); if (Spec && !Spec->isExplicitSpecialization()) @@ -3067,7 +3067,7 @@ Expr *Expr::IgnoreParenCasts() { Expr *Expr::IgnoreConversionOperatorSingleStep() { if (auto *MCE = dyn_cast(this)) { - if (MCE->getMethodDecl() && isa(MCE->getMethodDecl())) + if (isa_and_nonnull(MCE->getMethodDecl())) return MCE->getImplicitObjectArgument(); } return this; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 86fb396fabe2d9..d5057452cec9c5 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -2130,7 +2130,7 @@ static bool IsWeakLValue(const LValue &Value) { static bool isZeroSized(const LValue &Value) { const ValueDecl *Decl = GetLValueBaseDecl(Value); - if (Decl && isa(Decl)) { + if (isa_and_nonnull(Decl)) { QualType Ty = Decl->getType(); if (Ty->isArrayType()) return Ty->isIncompleteType() || diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index 30cff1ba2e6f37..4af4d7c00c5cb3 100644 --- a/clang/lib/AST/Mangle.cpp +++ 
b/clang/lib/AST/Mangle.cpp @@ -302,7 +302,7 @@ void MangleContext::mangleBlock(const DeclContext *DC, const BlockDecl *BD, assert((isa(DC) || isa(DC)) && "expected a NamedDecl or BlockDecl"); if (isa(DC)) - for (; DC && isa(DC); DC = DC->getParent()) + for (; isa_and_nonnull(DC); DC = DC->getParent()) (void) getBlockId(cast(DC), true); assert((isa(DC) || isa(DC)) && "expected a TranslationUnitDecl or a NamedDecl"); diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 36d611750ca48c..2f7a2763639207 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -2748,7 +2748,7 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T, return; } Out << '@'; - } else if (IsInLambda && D && isa(D)) { + } else if (IsInLambda && isa_and_nonnull(D)) { // The only lambda conversion operators are to function pointers, which // can differ by their calling convention and are typically deduced. So // we make sure that this type gets mangled properly. 
diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp index 3d6a1cc84c7b10..e97cb5e226f5c2 100644 --- a/clang/lib/AST/ParentMap.cpp +++ b/clang/lib/AST/ParentMap.cpp @@ -139,7 +139,9 @@ Stmt* ParentMap::getParent(Stmt* S) const { } Stmt *ParentMap::getParentIgnoreParens(Stmt *S) const { - do { S = getParent(S); } while (S && isa(S)); + do { + S = getParent(S); + } while (isa_and_nonnull(S)); return S; } @@ -155,7 +157,8 @@ Stmt *ParentMap::getParentIgnoreParenCasts(Stmt *S) const { Stmt *ParentMap::getParentIgnoreParenImpCasts(Stmt *S) const { do { S = getParent(S); - } while (S && isa(S) && cast(S)->IgnoreParenImpCasts() != S); + } while (isa_and_nonnull(S) && + cast(S)->IgnoreParenImpCasts() != S); return S; } diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 7e030e05512690..8f51d16b5db037 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -84,7 +84,7 @@ namespace { void PrintStmt(Stmt *S, int SubIndent) { IndentLevel += SubIndent; - if (S && isa(S)) { + if (isa_and_nonnull(S)) { // If this is an expr used in a stmt context, indent and newline it. Indent(); Visit(S); @@ -1939,7 +1939,7 @@ void StmtPrinter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *Node) { void StmtPrinter::VisitCXXMemberCallExpr(CXXMemberCallExpr *Node) { // If we have a conversion operator call only print the argument. CXXMethodDecl *MD = Node->getMethodDecl(); - if (MD && isa(MD)) { + if (isa_and_nonnull(MD)) { PrintExpr(Node->getImplicitObjectArgument()); return; } diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index bf50f2025de573..5dac1cd425bf61 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -577,7 +577,7 @@ static void computeBlockInfo(CodeGenModule &CGM, CodeGenFunction *CGF, // First, 'this'. 
if (block->capturesCXXThis()) { - assert(CGF && CGF->CurFuncDecl && isa(CGF->CurFuncDecl) && + assert(CGF && isa_and_nonnull(CGF->CurFuncDecl) && "Can't capture 'this' outside a method"); QualType thisType = cast(CGF->CurFuncDecl)->getThisType(); diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index b8cb78266130c8..5a032bdbf93791 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -859,7 +859,7 @@ void CodeGenFunction::EmitConstructorBody(FunctionArgList &Args) { // Enter the function-try-block before the constructor prologue if // applicable. - bool IsTryBody = (Body && isa(Body)); + bool IsTryBody = isa_and_nonnull(Body); if (IsTryBody) EnterCXXTryStmt(*cast(Body), true); @@ -1475,7 +1475,7 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) { // If the body is a function-try-block, enter the try before // anything else. - bool isTryBody = (Body && isa(Body)); + bool isTryBody = isa_and_nonnull(Body); if (isTryBody) EnterCXXTryStmt(*cast(Body), true); EmitAsanPrologueOrEpilogue(false); diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index 4eb65b34a89f56..0712f40fd8215a 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -715,7 +715,7 @@ bool ConstStructBuilder::Build(const InitListExpr *ILE, bool AllowOverwrite) { const Expr *Init = nullptr; if (ElementNo < ILE->getNumInits()) Init = ILE->getInit(ElementNo++); - if (Init && isa(Init)) + if (isa_and_nonnull(Init)) continue; // Zero-sized fields are not emitted, but their initializers may still diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 6410f9e102c907..f73d32de7c4848 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -72,7 +72,7 @@ class OMPLexicalScope : public CodeGenFunction::LexicalScope { static bool isCapturedVar(CodeGenFunction &CGF, const VarDecl *VD) { return 
CGF.LambdaCaptureFields.lookup(VD) || (CGF.CapturedStmtInfo && CGF.CapturedStmtInfo->lookup(VD)) || - (CGF.CurCodeDecl && isa(CGF.CurCodeDecl) && + (isa_and_nonnull(CGF.CurCodeDecl) && cast(CGF.CurCodeDecl)->capturesVariable(VD)); } @@ -227,7 +227,7 @@ class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope { static bool isCapturedVar(CodeGenFunction &CGF, const VarDecl *VD) { return CGF.LambdaCaptureFields.lookup(VD) || (CGF.CapturedStmtInfo && CGF.CapturedStmtInfo->lookup(VD)) || - (CGF.CurCodeDecl && isa(CGF.CurCodeDecl) && + (isa_and_nonnull(CGF.CurCodeDecl) && cast(CGF.CurCodeDecl)->capturesVariable(VD)); } @@ -315,7 +315,7 @@ LValue CodeGenFunction::EmitOMPSharedLValue(const Expr *E) { bool IsCaptured = LambdaCaptureFields.lookup(OrigVD) || (CapturedStmtInfo && CapturedStmtInfo->lookup(OrigVD)) || - (CurCodeDecl && isa(CurCodeDecl)); + (isa_and_nonnull(CurCodeDecl)); DeclRefExpr DRE(getContext(), const_cast(OrigVD), IsCaptured, OrigDRE->getType(), VK_LValue, OrigDRE->getExprLoc()); return EmitLValue(&DRE); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index f0345f3b191b88..f84b3b08220fd9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -2951,7 +2951,7 @@ void CodeGenFunction::emitAlignmentAssumptionCheck( SourceLocation SecondaryLoc, llvm::Value *Alignment, llvm::Value *OffsetValue, llvm::Value *TheCheck, llvm::Instruction *Assumption) { - assert(Assumption && isa(Assumption) && + assert(isa_and_nonnull(Assumption) && cast(Assumption)->getCalledOperand() == llvm::Intrinsic::getDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), diff --git a/clang/lib/Index/IndexBody.cpp b/clang/lib/Index/IndexBody.cpp index 08136baa5d408e..c18daf7faa7497 100644 --- a/clang/lib/Index/IndexBody.cpp +++ b/clang/lib/Index/IndexBody.cpp @@ -268,7 +268,7 @@ class BodyIndexer : public RecursiveASTVisitor { } return true; }; - bool IsPropCall = Containing && 
isa(Containing); + bool IsPropCall = isa_and_nonnull(Containing); // Implicit property message sends are not 'implicit'. if ((E->isImplicit() || IsPropCall) && !(IsPropCall && diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp index 8af4a97d00cb82..f085b943716442 100644 --- a/clang/lib/Lex/PPMacroExpansion.cpp +++ b/clang/lib/Lex/PPMacroExpansion.cpp @@ -226,7 +226,7 @@ void Preprocessor::updateModuleMacroInfo(const IdentifierInfo *II, bool IsSystemMacro = true; bool IsAmbiguous = false; if (auto *MD = Info.MD) { - while (MD && isa(MD)) + while (isa_and_nonnull(MD)) MD = MD->getPrevious(); if (auto *DMD = dyn_cast_or_null(MD)) { MI = DMD->getInfo(); diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index b9d0b59ef1db73..0f604c61fa3af9 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -442,7 +442,7 @@ static ControlFlowKind CheckFallThrough(AnalysisDeclContext &AC) { if (!live[B->getBlockID()]) { if (B->pred_begin() == B->pred_end()) { const Stmt *Term = B->getTerminatorStmt(); - if (Term && isa(Term)) + if (isa_and_nonnull(Term)) // When not adding EH edges from calls, catch clauses // can otherwise seem dead. Avoid noting them as dead. count += reachable_code::ScanReachableFromBlock(B, live); @@ -1100,7 +1100,7 @@ namespace { // issue a warn_fallthrough_attr_unreachable for them. for (const auto *B : *Cfg) { const Stmt *L = B->getLabel(); - if (L && isa(L) && ReachableBlocks.insert(B).second) + if (isa_and_nonnull(L) && ReachableBlocks.insert(B).second) BlockQueue.push_back(B); } @@ -1128,7 +1128,7 @@ namespace { if (!P) continue; const Stmt *Term = P->getTerminatorStmt(); - if (Term && isa(Term)) + if (isa_and_nonnull(Term)) continue; // Switch statement, good. 
const SwitchCase *SW = dyn_cast_or_null(P->getLabel()); @@ -1327,7 +1327,7 @@ static void DiagnoseSwitchLabelsFallthrough(Sema &S, AnalysisDeclContext &AC, B = *B->succ_begin(); Term = B->getTerminatorStmt(); } - if (!(B->empty() && Term && isa(Term))) { + if (!(B->empty() && isa_and_nonnull(Term))) { Preprocessor &PP = S.getPreprocessor(); StringRef AnnotationSpelling = getFallthroughAttrSpelling(PP, L); SmallString<64> TextToInsert(AnnotationSpelling); diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index c405fbc0aa421b..da88b6cae6e361 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -974,7 +974,7 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, R.setBegin(SS.getRange().getBegin()); Diag(CCLoc, diag::err_non_type_template_in_nested_name_specifier) - << (TD && isa(TD)) << Template << R; + << isa_and_nonnull(TD) << Template << R; NoteAllFoundTemplates(Template); return true; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 300af02239779f..07cd0727eb3f4a 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3839,11 +3839,11 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, if (CallType != VariadicDoesNotApply && (!FD || FD->getBuiltinID() != Builtin::BI__noop)) { unsigned NumParams = Proto ? Proto->getNumParams() - : FDecl && isa(FDecl) - ? cast(FDecl)->getNumParams() - : FDecl && isa(FDecl) - ? cast(FDecl)->param_size() - : 0; + : isa_and_nonnull(FDecl) + ? cast(FDecl)->getNumParams() + : isa_and_nonnull(FDecl) + ? cast(FDecl)->param_size() + : 0; for (unsigned ArgIdx = NumParams; ArgIdx < Args.size(); ++ArgIdx) { // Args[ArgIdx] can be null in malformed code. 
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index cf461a68d55263..f3af8dee6b090c 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -2074,7 +2074,7 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, if (DirectInitRange.isValid()) { assert(Initializer && "Have parens but no initializer."); InitStyle = CXXNewInitializationStyle::Parens; - } else if (Initializer && isa(Initializer)) + } else if (isa_and_nonnull(Initializer)) InitStyle = CXXNewInitializationStyle::Braces; else { assert((!Initializer || isa(Initializer) || @@ -3823,7 +3823,7 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, bool UseGlobal, // Otherwise, the usual operator delete[] should be the // function we just found. - else if (OperatorDelete && isa(OperatorDelete)) + else if (isa_and_nonnull(OperatorDelete)) UsualArrayDeleteWantsSize = UsualDeallocFnInfo(*this, DeclAccessPair::make(OperatorDelete, AS_public)) @@ -8595,7 +8595,7 @@ static void CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures( assert(S.CurContext->isDependentContext()); #ifndef NDEBUG DeclContext *DC = S.CurContext; - while (DC && isa(DC)) + while (isa_and_nonnull(DC)) DC = DC->getParent(); assert( CurrentLSI->CallOperator == DC && @@ -9172,7 +9172,7 @@ ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC, // - Teach the handful of places that iterate over FunctionScopes to // stop at the outermost enclosing lexical scope." 
DeclContext *DC = CurContext; - while (DC && isa(DC)) + while (isa_and_nonnull(DC)) DC = DC->getParent(); const bool IsInLambdaDeclContext = isLambdaCallOperator(DC); if (IsInLambdaDeclContext && CurrentLSI && diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index ed8b226a6b39f5..7244f3ef4e829e 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -2194,7 +2194,7 @@ void InitListChecker::CheckStructUnionTypes( // Designated inits always initialize fields, so if we see one, all // remaining base classes have no explicit initializer. - if (Init && isa(Init)) + if (isa_and_nonnull(Init)) Init = nullptr; // C++ [over.match.class.deduct]p1.6: @@ -6350,7 +6350,7 @@ void InitializationSequence::InitializeFrom(Sema &S, // class member of array type from a parenthesized initializer list. else if (S.getLangOpts().CPlusPlus && Entity.getKind() == InitializedEntity::EK_Member && - Initializer && isa(Initializer)) { + isa_and_nonnull(Initializer)) { TryListInitialization(S, Entity, Kind, cast(Initializer), *this, TreatUnavailableAsInvalid); AddParenthesizedArrayInitStep(DestType); @@ -8793,7 +8793,7 @@ ExprResult InitializationSequence::Perform(Sema &S, // constant expressions here in order to perform narrowing checks =( EnterExpressionEvaluationContext Evaluated( S, EnterExpressionEvaluationContext::InitList, - CurInit.get() && isa(CurInit.get())); + isa_and_nonnull(CurInit.get())); // C++ [class.abstract]p2: // no objects of an abstract class can be created except as subobjects diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 57465d4a77ac29..411e9af26f2b7b 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3701,7 +3701,7 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD, if (isLambdaConversionOperator(FD)) return false; - if (RetExpr && isa(RetExpr)) { + if (isa_and_nonnull(RetExpr)) { // If the deduction is for a return statement and the initializer is // a 
braced-init-list, the program is ill-formed. Diag(RetExpr->getExprLoc(), diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 40a759ea330de4..a032e3ec6f6353 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1936,7 +1936,7 @@ DeclResult Sema::CheckClassTemplate( // We may have found the injected-class-name of a class template, // class template partial specialization, or class template specialization. // In these cases, grab the template that is being defined or specialized. - if (!PrevClassTemplate && PrevDecl && isa(PrevDecl) && + if (!PrevClassTemplate && isa_and_nonnull(PrevDecl) && cast(PrevDecl)->isInjectedClassName()) { PrevDecl = cast(PrevDecl->getDeclContext()); PrevClassTemplate From 39cf8803391e695c7ec3a85fc1b2c4622f52b481 Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Mon, 10 Jun 2024 20:19:24 -0700 Subject: [PATCH 13/82] [libc][stdlib] Change old unsigned short variables to size_t (#95065) They were assigned from calls to find_chunk_ptr_for_size which return size_t now. --- libc/src/stdlib/freelist.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/stdlib/freelist.h b/libc/src/stdlib/freelist.h index 20b4977835bef8..c01ed6eddb7d46 100644 --- a/libc/src/stdlib/freelist.h +++ b/libc/src/stdlib/freelist.h @@ -99,7 +99,7 @@ bool FreeList::add_chunk(span chunk) { aliased.bytes = chunk.data(); - unsigned short chunk_ptr = find_chunk_ptr_for_size(chunk.size(), false); + size_t chunk_ptr = find_chunk_ptr_for_size(chunk.size(), false); // Add it to the correct list. aliased.node->size = chunk.size(); @@ -114,7 +114,7 @@ span FreeList::find_chunk(size_t size) const { if (size == 0) return span(); - unsigned short chunk_ptr = find_chunk_ptr_for_size(size, true); + size_t chunk_ptr = find_chunk_ptr_for_size(size, true); // Check that there's data. 
This catches the case where we run off the // end of the array @@ -144,7 +144,7 @@ span FreeList::find_chunk(size_t size) const { template bool FreeList::remove_chunk(span chunk) { - unsigned short chunk_ptr = find_chunk_ptr_for_size(chunk.size(), true); + size_t chunk_ptr = find_chunk_ptr_for_size(chunk.size(), true); // Walk that list, finding the chunk. union { From 1934208132e3a084c5952656ae29816958b8207c Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Mon, 10 Jun 2024 20:33:38 -0700 Subject: [PATCH 14/82] [lldb] NFC add comments and test case for ObjectFileMachO delay-init (#95067) Add comments and a test for delay-init libraries on macOS. I originally added the support in 954d00e87cdd77d0e9e367be52e62340467bd779 a month ago, but without these additional clarifications. rdar://126885033 --- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 8 +++ .../API/macosx/delay-init-dependency/Makefile | 11 ++++ .../TestDelayInitDependency.py | 62 +++++++++++++++++++ .../API/macosx/delay-init-dependency/foo.c | 1 + .../API/macosx/delay-init-dependency/main.c | 9 +++ 5 files changed, 91 insertions(+) create mode 100644 lldb/test/API/macosx/delay-init-dependency/Makefile create mode 100644 lldb/test/API/macosx/delay-init-dependency/TestDelayInitDependency.py create mode 100644 lldb/test/API/macosx/delay-init-dependency/foo.c create mode 100644 lldb/test/API/macosx/delay-init-dependency/main.c diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 4dd23bb1e4dbec..2979bf69bf762a 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -5140,12 +5140,20 @@ uint32_t ObjectFileMachO::GetDependentModules(FileSpecList &files) { case LC_LOADFVMLIB: case LC_LOAD_UPWARD_DYLIB: { uint32_t name_offset = cmd_offset + m_data.GetU32(&offset); + // For LC_LOAD_DYLIB there is an alternate encoding + // which adds a uint32_t `flags` 
field for `DYLIB_USE_*` + // flags. This can be detected by a timestamp field with + // the `DYLIB_USE_MARKER` constant value. bool is_delayed_init = false; uint32_t use_command_marker = m_data.GetU32(&offset); if (use_command_marker == 0x1a741800 /* DYLIB_USE_MARKER */) { offset += 4; /* uint32_t current_version */ offset += 4; /* uint32_t compat_version */ uint32_t flags = m_data.GetU32(&offset); + // If this LC_LOAD_DYLIB is marked delay-init, + // don't report it as a dependent library -- it + // may be loaded in the process at some point, + // but will most likely not be loaded at launch. if (flags & 0x08 /* DYLIB_USE_DELAYED_INIT */) is_delayed_init = true; } diff --git a/lldb/test/API/macosx/delay-init-dependency/Makefile b/lldb/test/API/macosx/delay-init-dependency/Makefile new file mode 100644 index 00000000000000..246ea0f34e1a1c --- /dev/null +++ b/lldb/test/API/macosx/delay-init-dependency/Makefile @@ -0,0 +1,11 @@ +C_SOURCES := main.c +LD_EXTRAS := -L. -Wl,-delay_library,libfoo.dylib + +.PHONY: build-libfoo +all: build-libfoo a.out + +include Makefile.rules + +build-libfoo: foo.c + $(MAKE) -f $(MAKEFILE_RULES) \ + DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES diff --git a/lldb/test/API/macosx/delay-init-dependency/TestDelayInitDependency.py b/lldb/test/API/macosx/delay-init-dependency/TestDelayInitDependency.py new file mode 100644 index 00000000000000..44ed2b1d21f186 --- /dev/null +++ b/lldb/test/API/macosx/delay-init-dependency/TestDelayInitDependency.py @@ -0,0 +1,62 @@ +"""Test binaries with delay-init dependencies.""" + +import subprocess +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestDelayInitDependencies(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @skipUnlessDarwin + def test_delay_init_dependency(self): + TestBase.setUp(self) + out = subprocess.run( + ["xcrun", "ld", "-delay_library"], + universal_newlines=True, + stdout=subprocess.PIPE, +
stderr=subprocess.PIPE, + ) + if "delay_library missing" not in out.stderr: + self.skipTest( + "Skipped because the linker doesn't know about -delay_library" + ) + self.build() + main_source = "main.c" + exe = self.getBuildArtifact("a.out") + lib = self.getBuildArtifact("libfoo.dylib") + + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + # libfoo.dylib should not be in the target pre-execution + for m in target.modules: + self.assertNotEqual(m.GetFileSpec().GetFilename(), "libfoo.dylib") + + # This run without arguments will not load libfoo.dylib + li = lldb.SBLaunchInfo([]) + li.SetWorkingDirectory(self.getBuildDir()) + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "// break here", lldb.SBFileSpec("main.c"), li + ) + for m in target.modules: + self.assertNotEqual(m.GetFileSpec().GetFilename(), "libfoo.dylib") + + process.Kill() + self.dbg.DeleteTarget(target) + + # This run with one argument will load libfoo.dylib + li = lldb.SBLaunchInfo([]) + li.SetWorkingDirectory(self.getBuildDir()) + li.SetArguments(["one-argument"], True) + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "// break here", lldb.SBFileSpec("main.c"), li + ) + + found_libfoo = False + for m in target.modules: + if m.GetFileSpec().GetFilename() == "libfoo.dylib": + found_libfoo = True + self.assertTrue(found_libfoo) diff --git a/lldb/test/API/macosx/delay-init-dependency/foo.c b/lldb/test/API/macosx/delay-init-dependency/foo.c new file mode 100644 index 00000000000000..de1cbc4c4648a1 --- /dev/null +++ b/lldb/test/API/macosx/delay-init-dependency/foo.c @@ -0,0 +1 @@ +int foo() { return 5; } diff --git a/lldb/test/API/macosx/delay-init-dependency/main.c b/lldb/test/API/macosx/delay-init-dependency/main.c new file mode 100644 index 00000000000000..57d251e6b2abe4 --- /dev/null +++ b/lldb/test/API/macosx/delay-init-dependency/main.c @@ -0,0 +1,9 @@ +int foo(); +int main(int argc, char **argv) { + int retval = 0; 
+ // Only call foo() if one argument is passed + if (argc == 2) + retval = foo(); + + return retval; // break here +} From 6ffdcfa7fcb32192104b9742b02489395c4c6ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 10 Jun 2024 21:05:39 -0700 Subject: [PATCH 15/82] [flang] Lower REDUCE intrinsic with DIM argument (#94771) This is a follow up patch to #94652 and handles the lowering of the reduce intrinsic with DIM argument and non scalar result. --- .../Optimizer/Builder/Runtime/Reduction.h | 7 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 12 +- .../Optimizer/Builder/Runtime/Reduction.cpp | 204 ++++++++++++++++ flang/test/Lower/Intrinsics/reduce.f90 | 221 ++++++++++++++++++ 4 files changed, 443 insertions(+), 1 deletion(-) diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h b/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h index 27652208b524ee..fedf453a6dc8de 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h @@ -240,6 +240,13 @@ mlir::Value genReduce(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value maskBox, mlir::Value identity, mlir::Value ordered); +/// Generate call to `Reduce` intrinsic runtime routine. This is the version +/// that takes arrays of any rank with a dim argument specified. 
+void genReduceDim(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value arrayBox, mlir::Value operation, mlir::Value dim, + mlir::Value maskBox, mlir::Value identity, + mlir::Value ordered, mlir::Value resultBox); + } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_REDUCTION_H diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6101730ce17280..c3ef96956be1c5 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -5790,7 +5790,17 @@ IntrinsicLibrary::genReduce(mlir::Type resultType, return fir::runtime::genReduce(builder, loc, array, operation, mask, identity, ordered); } - TODO(loc, "reduce with array result"); + // Handle cases that have an array result. + // Create mutable fir.box to be passed to the runtime for the result. + mlir::Type resultArrayType = builder.getVarLenSeqTy(resultType, rank - 1); + fir::MutableBoxValue resultMutableBox = + fir::factory::createTempMutableBox(builder, loc, resultArrayType); + mlir::Value resultIrBox = + fir::factory::getMutableIRBox(builder, loc, resultMutableBox); + mlir::Value dim = fir::getBase(args[2]); + fir::runtime::genReduceDim(builder, loc, array, operation, dim, mask, + identity, ordered, resultIrBox); + return readAndAddCleanUp(resultMutableBox, resultType, "REDUCE"); } // REPEAT diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp index 0a280816ffcc83..4b086a98de47b2 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp @@ -505,6 +505,50 @@ struct ForcedReduceReal16 { } }; +/// Placeholder for DIM real*10 version of Reduce Intrinsic +struct ForcedReduceReal10Dim { + static constexpr const char *name = + ExpandAndQuoteKey(RTNAME(ReduceReal10Dim)); + static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { + return [](mlir::MLIRContext 
*ctx) { + auto ty = mlir::FloatType::getF80(ctx); + auto boxTy = + fir::runtime::getModel()(ctx); + auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty); + auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); + auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); + auto refTy = fir::ReferenceType::get(ty); + auto refBoxTy = fir::ReferenceType::get(boxTy); + auto i1Ty = mlir::IntegerType::get(ctx, 1); + return mlir::FunctionType::get( + ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty}, + {}); + }; + } +}; + +/// Placeholder for DIM real*16 version of Reduce Intrinsic +struct ForcedReduceReal16Dim { + static constexpr const char *name = + ExpandAndQuoteKey(RTNAME(ReduceReal16Dim)); + static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { + return [](mlir::MLIRContext *ctx) { + auto ty = mlir::FloatType::getF128(ctx); + auto boxTy = + fir::runtime::getModel()(ctx); + auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty); + auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); + auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); + auto refTy = fir::ReferenceType::get(ty); + auto refBoxTy = fir::ReferenceType::get(boxTy); + auto i1Ty = mlir::IntegerType::get(ctx, 1); + return mlir::FunctionType::get( + ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty}, + {}); + }; + } +}; + /// Placeholder for integer*16 version of Reduce Intrinsic struct ForcedReduceInteger16 { static constexpr const char *name = @@ -525,6 +569,28 @@ struct ForcedReduceInteger16 { } }; +/// Placeholder for DIM integer*16 version of Reduce Intrinsic +struct ForcedReduceInteger16Dim { + static constexpr const char *name = + ExpandAndQuoteKey(RTNAME(ReduceInteger16Dim)); + static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { + return [](mlir::MLIRContext *ctx) { + auto ty = mlir::IntegerType::get(ctx, 128); + auto boxTy = + fir::runtime::getModel()(ctx); + auto opTy = 
mlir::FunctionType::get(ctx, {ty, ty}, ty); + auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); + auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); + auto refTy = fir::ReferenceType::get(ty); + auto refBoxTy = fir::ReferenceType::get(boxTy); + auto i1Ty = mlir::IntegerType::get(ctx, 1); + return mlir::FunctionType::get( + ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty}, + {}); + }; + } +}; + /// Placeholder for complex(10) version of Reduce Intrinsic struct ForcedReduceComplex10 { static constexpr const char *name = @@ -546,6 +612,28 @@ struct ForcedReduceComplex10 { } }; +/// Placeholder for Dim complex(10) version of Reduce Intrinsic +struct ForcedReduceComplex10Dim { + static constexpr const char *name = + ExpandAndQuoteKey(RTNAME(CppReduceComplex10Dim)); + static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { + return [](mlir::MLIRContext *ctx) { + auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx)); + auto boxTy = + fir::runtime::getModel()(ctx); + auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty); + auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); + auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); + auto refTy = fir::ReferenceType::get(ty); + auto refBoxTy = fir::ReferenceType::get(boxTy); + auto i1Ty = mlir::IntegerType::get(ctx, 1); + return mlir::FunctionType::get( + ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty}, + {}); + }; + } +}; + /// Placeholder for complex(16) version of Reduce Intrinsic struct ForcedReduceComplex16 { static constexpr const char *name = @@ -567,6 +655,28 @@ struct ForcedReduceComplex16 { } }; +/// Placeholder for Dim complex(16) version of Reduce Intrinsic +struct ForcedReduceComplex16Dim { + static constexpr const char *name = + ExpandAndQuoteKey(RTNAME(CppReduceComplex16Dim)); + static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { + return [](mlir::MLIRContext *ctx) { + auto ty = 
mlir::ComplexType::get(mlir::FloatType::getF128(ctx)); + auto boxTy = + fir::runtime::getModel()(ctx); + auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty); + auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); + auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); + auto refTy = fir::ReferenceType::get(ty); + auto refBoxTy = fir::ReferenceType::get(boxTy); + auto i1Ty = mlir::IntegerType::get(ctx, 1); + return mlir::FunctionType::get( + ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty}, + {}); + }; + } +}; + /// Generate call to specialized runtime function that takes a mask and /// dim argument. The All, Any, and Count intrinsics use this pattern. template @@ -1461,3 +1571,97 @@ mlir::Value fir::runtime::genReduce(fir::FirOpBuilder &builder, maskBox, identity, ordered); return builder.create(loc, func, args).getResult(0); } + +void fir::runtime::genReduceDim(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value arrayBox, mlir::Value operation, + mlir::Value dim, mlir::Value maskBox, + mlir::Value identity, mlir::Value ordered, + mlir::Value resultBox) { + mlir::func::FuncOp func; + auto ty = arrayBox.getType(); + auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty); + auto eleTy = mlir::cast(arrTy).getEleTy(); + + mlir::MLIRContext *ctx = builder.getContext(); + fir::factory::CharacterExprHelper charHelper{builder, loc}; + + if (eleTy.isF16()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isBF16()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isF32()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isF64()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isF80()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isF128()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(1))) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else 
if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(2))) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(4))) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(8))) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(16))) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy == fir::ComplexType::get(ctx, 2)) + func = fir::runtime::getRuntimeFunc(loc, + builder); + else if (eleTy == fir::ComplexType::get(ctx, 3)) + func = fir::runtime::getRuntimeFunc(loc, + builder); + else if (eleTy == fir::ComplexType::get(ctx, 4)) + func = fir::runtime::getRuntimeFunc(loc, + builder); + else if (eleTy == fir::ComplexType::get(ctx, 8)) + func = fir::runtime::getRuntimeFunc(loc, + builder); + else if (eleTy == fir::ComplexType::get(ctx, 10)) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy == fir::ComplexType::get(ctx, 16)) + func = fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy == fir::LogicalType::get(ctx, 1)) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy == fir::LogicalType::get(ctx, 2)) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy == fir::LogicalType::get(ctx, 4)) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else if (eleTy == fir::LogicalType::get(ctx, 8)) + func = + fir::runtime::getRuntimeFunc(loc, builder); + else if (fir::isa_char(eleTy) && charHelper.getCharacterKind(eleTy) == 1) + func = fir::runtime::getRuntimeFunc(loc, + builder); + else if (fir::isa_char(eleTy) && charHelper.getCharacterKind(eleTy) == 2) + func = fir::runtime::getRuntimeFunc(loc, + builder); + else if (fir::isa_char(eleTy) && charHelper.getCharacterKind(eleTy) == 4) + func = fir::runtime::getRuntimeFunc(loc, + builder); + else if (fir::isa_derived(eleTy)) + func = 
fir::runtime::getRuntimeFunc(loc, + builder); + else + fir::intrinsicTypeTODO(builder, eleTy, loc, "REDUCE"); + + auto fTy = func.getFunctionType(); + auto sourceFile = fir::factory::locationToFilename(builder, loc); + + auto sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); + auto opAddr = builder.create(loc, fTy.getInput(2), operation); + auto args = fir::runtime::createArguments( + builder, loc, fTy, resultBox, arrayBox, opAddr, sourceFile, sourceLine, + dim, maskBox, identity, ordered); + builder.create(loc, func, args); +} diff --git a/flang/test/Lower/Intrinsics/reduce.f90 b/flang/test/Lower/Intrinsics/reduce.f90 index 36900abaa79f8c..842e626d7cc397 100644 --- a/flang/test/Lower/Intrinsics/reduce.f90 +++ b/flang/test/Lower/Intrinsics/reduce.f90 @@ -392,4 +392,225 @@ subroutine testtype(a) ! CHECK: fir.call @_FortranAReduceDerivedType +subroutine integer1dim(a, id) + integer(1), intent(in) :: a(:,:) + integer(1), allocatable :: res(:) + + res = reduce(a, red_int1, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceInteger1Dim + +subroutine integer2dim(a, id) + integer(2), intent(in) :: a(:,:) + integer(2), allocatable :: res(:) + + res = reduce(a, red_int2, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceInteger2Dim + +subroutine integer4dim(a, id) + integer(4), intent(in) :: a(:,:) + integer(4), allocatable :: res(:) + + res = reduce(a, red_int4, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceInteger4Dim + +subroutine integer8dim(a, id) + integer(8), intent(in) :: a(:,:) + integer(8), allocatable :: res(:) + + res = reduce(a, red_int8, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceInteger8Dim + +subroutine integer16dim(a, id) + integer(16), intent(in) :: a(:,:) + integer(16), allocatable :: res(:) + + res = reduce(a, red_int16, 2) +end subroutine + +! 
CHECK: fir.call @_FortranAReduceInteger16Dim + +subroutine real2dim(a, id) + real(2), intent(in) :: a(:,:) + real(2), allocatable :: res(:) + + res = reduce(a, red_real2, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceReal2Dim + +subroutine real3dim(a, id) + real(3), intent(in) :: a(:,:) + real(3), allocatable :: res(:) + + res = reduce(a, red_real3, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceReal3Dim + +subroutine real4dim(a, id) + real(4), intent(in) :: a(:,:) + real(4), allocatable :: res(:) + + res = reduce(a, red_real4, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceReal4Dim + +subroutine real8dim(a, id) + real(8), intent(in) :: a(:,:) + real(8), allocatable :: res(:) + + res = reduce(a, red_real8, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceReal8Dim + +subroutine real10dim(a, id) + real(10), intent(in) :: a(:,:) + real(10), allocatable :: res(:) + + res = reduce(a, red_real10, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceReal10Dim + +subroutine real16dim(a, id) + real(16), intent(in) :: a(:,:) + real(16), allocatable :: res(:) + + res = reduce(a, red_real16, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceReal16Dim + +subroutine complex2dim(a, id) + complex(2), intent(in) :: a(:,:) + complex(2), allocatable :: res(:) + + res = reduce(a, red_complex2, 2) +end subroutine + +! CHECK: fir.call @_FortranACppReduceComplex2Dim + +subroutine complex3dim(a, id) + complex(3), intent(in) :: a(:,:) + complex(3), allocatable :: res(:) + + res = reduce(a, red_complex3, 2) +end subroutine + +! CHECK: fir.call @_FortranACppReduceComplex3Dim + +subroutine complex4dim(a, id) + complex(4), intent(in) :: a(:,:) + complex(4), allocatable :: res(:) + + res = reduce(a, red_complex4, 2) +end subroutine + +! CHECK: fir.call @_FortranACppReduceComplex4Dim + +subroutine complex8dim(a, id) + complex(8), intent(in) :: a(:,:) + complex(8), allocatable :: res(:) + + res = reduce(a, red_complex8, 2) +end subroutine + +! 
CHECK: fir.call @_FortranACppReduceComplex8Dim + +subroutine complex10dim(a, id) + complex(10), intent(in) :: a(:,:) + complex(10), allocatable :: res(:) + + res = reduce(a, red_complex10, 2) +end subroutine + +! CHECK: fir.call @_FortranACppReduceComplex10Dim + +subroutine complex16dim(a, id) + complex(16), intent(in) :: a(:,:) + complex(16), allocatable :: res(:) + + res = reduce(a, red_complex16, 2) +end subroutine + +! CHECK: fir.call @_FortranACppReduceComplex16Dim + +subroutine logical1dim(a, id) + logical(1), intent(in) :: a(:,:) + logical(1), allocatable :: res(:) + + res = reduce(a, red_log1, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceLogical1Dim + +subroutine logical2dim(a, id) + logical(2), intent(in) :: a(:,:) + logical(2), allocatable :: res(:) + + res = reduce(a, red_log2, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceLogical2Dim + +subroutine logical4dim(a, id) + logical(4), intent(in) :: a(:,:) + logical(4), allocatable :: res(:) + + res = reduce(a, red_log4, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceLogical4Dim + +subroutine logical8dim(a, id) + logical(8), intent(in) :: a(:,:) + logical(8), allocatable :: res(:) + + res = reduce(a, red_log8, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceLogical8Dim + +subroutine testtypeDim(a) + type(t1), intent(in) :: a(:,:) + type(t1), allocatable :: res(:) + res = reduce(a, red_type, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceDerivedTypeDim + +subroutine char1dim(a) + character(1), intent(in) :: a(:, :) + character(1), allocatable :: res(:) + res = reduce(a, red_char1, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceCharacter1Dim + +subroutine char2dim(a) + character(kind=2), intent(in) :: a(:, :) + character(kind=2), allocatable :: res(:) + res = reduce(a, red_char2, 2) +end subroutine + +! 
CHECK: fir.call @_FortranAReduceCharacter2Dim + +subroutine char4dim(a) + character(kind=4), intent(in) :: a(:, :) + character(kind=4), allocatable :: res(:) + res = reduce(a, red_char4, 2) +end subroutine + +! CHECK: fir.call @_FortranAReduceCharacter4Dim + end module From 690480f5883a2fe9377fd0eeae21d0825e1b3661 Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Tue, 11 Jun 2024 07:28:42 +0300 Subject: [PATCH 16/82] [PAC][AArch64] Lower ptrauth constants in data (#94240) Lower global references to ptrauth constants into `@AUTH` `MCExpr`'s. The logic is common for MachO and ELF - test both. --------- Co-authored-by: Ahmed Bougacha --- llvm/include/llvm/CodeGen/AsmPrinter.h | 5 + llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3 + llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 48 +++++ llvm/test/CodeGen/AArch64/ptrauth-reloc.ll | 176 ++++++++++++++++++ 4 files changed, 232 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/ptrauth-reloc.ll diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 81c3e4be95e9ff..011f8c6534b6a7 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -38,6 +38,7 @@ class BasicBlock; class BlockAddress; class Constant; class ConstantArray; +class ConstantPtrAuth; class DataLayout; class DIE; class DIEAbbrev; @@ -585,6 +586,10 @@ class AsmPrinter : public MachineFunctionPass { emitGlobalConstant(DL, CV); } + virtual const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) { + report_fatal_error("ptrauth constant lowering not implemented"); + } + /// Return true if the basic block has exactly one predecessor and the control /// transfer mechanism between the predecessor and this block is a /// fall-through. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index e8bab26907b7e1..2943b270cd5df1 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3181,6 +3181,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { if (const ConstantInt *CI = dyn_cast(CV)) return MCConstantExpr::create(CI->getZExtValue(), Ctx); + if (const ConstantPtrAuth *CPA = dyn_cast(CV)) + return lowerConstantPtrAuth(*CPA); + if (const GlobalValue *GV = dyn_cast(CV)) return MCSymbolRefExpr::create(getSymbol(GV), Ctx); diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 7da540f8ef8e50..da11539eab348f 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -90,6 +90,8 @@ class AArch64AsmPrinter : public AsmPrinter { return MCInstLowering.lowerOperand(MO, MCOp); } + const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) override; + void emitStartOfAsmFile(Module &M) override; void emitJumpTableInfo() override; std::tuplegetInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4); } +const MCExpr * +AArch64AsmPrinter::lowerConstantPtrAuth(const ConstantPtrAuth &CPA) { + MCContext &Ctx = OutContext; + + // Figure out the base symbol and the addend, if any. + APInt Offset(64, 0); + const Value *BaseGV = CPA.getPointer()->stripAndAccumulateConstantOffsets( + getDataLayout(), Offset, /*AllowNonInbounds=*/true); + + auto *BaseGVB = dyn_cast(BaseGV); + + // If we can't understand the referenced ConstantExpr, there's nothing + // else we can do: emit an error. + if (!BaseGVB) { + BaseGV->getContext().emitError( + "cannot resolve target base/addend of ptrauth constant"); + return nullptr; + } + + // If there is an addend, turn that into the appropriate MCExpr. 
+ const MCExpr *Sym = MCSymbolRefExpr::create(getSymbol(BaseGVB), Ctx); + if (Offset.sgt(0)) + Sym = MCBinaryExpr::createAdd( + Sym, MCConstantExpr::create(Offset.getSExtValue(), Ctx), Ctx); + else if (Offset.slt(0)) + Sym = MCBinaryExpr::createSub( + Sym, MCConstantExpr::create((-Offset).getSExtValue(), Ctx), Ctx); + + uint64_t KeyID = CPA.getKey()->getZExtValue(); + // We later rely on valid KeyID value in AArch64PACKeyIDToString call from + // AArch64AuthMCExpr::printImpl, so fail fast. + if (KeyID > AArch64PACKey::LAST) + report_fatal_error("AArch64 PAC Key ID '" + Twine(KeyID) + + "' out of range [0, " + + Twine((unsigned)AArch64PACKey::LAST) + "]"); + + uint64_t Disc = CPA.getDiscriminator()->getZExtValue(); + if (!isUInt<16>(Disc)) + report_fatal_error("AArch64 PAC Discriminator '" + Twine(Disc) + + "' out of range [0, 0xFFFF]"); + + // Finally build the complete @AUTH expr. + return AArch64AuthMCExpr::create(Sym, Disc, AArch64PACKey::ID(KeyID), + CPA.hasAddressDiscriminator(), Ctx); +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. 
#include "AArch64GenMCPseudoLowering.inc" diff --git a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll new file mode 100644 index 00000000000000..b7304b957a0013 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll @@ -0,0 +1,176 @@ +; RUN: rm -rf %t && split-file %s %t && cd %t + +;--- ok.ll + +; RUN: llc < ok.ll -mtriple arm64e-apple-darwin \ +; RUN: | FileCheck %s --check-prefix=CHECK-MACHO +; RUN: llc < ok.ll -mtriple aarch64-elf -mattr=+pauth \ +; RUN: | FileCheck %s --check-prefix=CHECK-ELF + +; RUN: llc < ok.ll -mtriple arm64e-apple-darwin \ +; RUN: -global-isel -verify-machineinstrs -global-isel-abort=1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-MACHO +; RUN: llc < ok.ll -mtriple aarch64-elf -mattr=+pauth \ +; RUN: -global-isel -verify-machineinstrs -global-isel-abort=1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ELF + +@g = external global i32 + +@g_weak = extern_weak global i32 + +@g_strong_def = constant i32 42 + +; CHECK-ELF-LABEL: .globl g.ref.ia.0 +; CHECK-ELF-NEXT: .p2align 4 +; CHECK-ELF-NEXT: g.ref.ia.0: +; CHECK-ELF-NEXT: .xword 5 +; CHECK-ELF-NEXT: .xword g@AUTH(ia,0) +; CHECK-ELF-NEXT: .xword 6 + +; CHECK-MACHO-LABEL: .section __DATA,__const +; CHECK-MACHO-NEXT: .globl _g.ref.ia.0 +; CHECK-MACHO-NEXT: .p2align 4 +; CHECK-MACHO-NEXT: _g.ref.ia.0: +; CHECK-MACHO-NEXT: .quad 5 +; CHECK-MACHO-NEXT: .quad _g@AUTH(ia,0) +; CHECK-MACHO-NEXT: .quad 6 + +@g.ref.ia.0 = constant { i64, ptr, i64 } { i64 5, ptr ptrauth (ptr @g, i32 0), i64 6 } + +; CHECK-ELF-LABEL: .globl g.ref.ia.42 +; CHECK-ELF-NEXT: .p2align 3 +; CHECK-ELF-NEXT: g.ref.ia.42: +; CHECK-ELF-NEXT: .xword g@AUTH(ia,42) + +; CHECK-MACHO-LABEL: .globl _g.ref.ia.42 +; CHECK-MACHO-NEXT: .p2align 3 +; CHECK-MACHO-NEXT: _g.ref.ia.42: +; CHECK-MACHO-NEXT: .quad _g@AUTH(ia,42) + +@g.ref.ia.42 = constant ptr ptrauth (ptr @g, i32 0, i64 42) + +; CHECK-ELF-LABEL: .globl g.ref.ib.0 +; CHECK-ELF-NEXT: .p2align 4 +; CHECK-ELF-NEXT: g.ref.ib.0: +; 
CHECK-ELF-NEXT: .xword 5 +; CHECK-ELF-NEXT: .xword g@AUTH(ib,0) +; CHECK-ELF-NEXT: .xword 6 + +; CHECK-MACHO-LABEL: .globl _g.ref.ib.0 +; CHECK-MACHO-NEXT: .p2align 4 +; CHECK-MACHO-NEXT: _g.ref.ib.0: +; CHECK-MACHO-NEXT: .quad 5 +; CHECK-MACHO-NEXT: .quad _g@AUTH(ib,0) +; CHECK-MACHO-NEXT: .quad 6 + +@g.ref.ib.0 = constant { i64, ptr, i64 } { i64 5, ptr ptrauth (ptr @g, i32 1, i64 0), i64 6 } + +; CHECK-ELF-LABEL: .globl g.ref.da.42.addr +; CHECK-ELF-NEXT: .p2align 3 +; CHECK-ELF-NEXT: g.ref.da.42.addr: +; CHECK-ELF-NEXT: .xword g@AUTH(da,42,addr) + +; CHECK-MACHO-LABEL: .globl _g.ref.da.42.addr +; CHECK-MACHO-NEXT: .p2align 3 +; CHECK-MACHO-NEXT: _g.ref.da.42.addr: +; CHECK-MACHO-NEXT: .quad _g@AUTH(da,42,addr) + +@g.ref.da.42.addr = constant ptr ptrauth (ptr @g, i32 2, i64 42, ptr @g.ref.da.42.addr) + +; CHECK-ELF-LABEL: .globl g.offset.ref.da.0 +; CHECK-ELF-NEXT: .p2align 3 +; CHECK-ELF-NEXT: g.offset.ref.da.0: +; CHECK-ELF-NEXT: .xword (g+16)@AUTH(da,0) + +; CHECK-MACHO-LABEL: .globl _g.offset.ref.da.0 +; CHECK-MACHO-NEXT: .p2align 3 +; CHECK-MACHO-NEXT: _g.offset.ref.da.0: +; CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) + +@g.offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) + +; CHECK-ELF-LABEL: .globl g.big_offset.ref.da.0 +; CHECK-ELF-NEXT: .p2align 3 +; CHECK-ELF-NEXT: g.big_offset.ref.da.0: +; CHECK-ELF-NEXT: .xword (g+2147549185)@AUTH(da,0) + +; CHECK-MACHO-LABEL: .globl _g.big_offset.ref.da.0 +; CHECK-MACHO-NEXT: .p2align 3 +; CHECK-MACHO-NEXT: _g.big_offset.ref.da.0: +; CHECK-MACHO-NEXT: .quad (_g+2147549185)@AUTH(da,0) + +@g.big_offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2) + +; CHECK-ELF-LABEL: .globl g.weird_ref.da.0 +; CHECK-ELF-NEXT: .p2align 3 +; CHECK-ELF-NEXT: g.weird_ref.da.0: +; CHECK-ELF-NEXT: .xword (g+16)@AUTH(da,0) + +; CHECK-MACHO-LABEL: .globl _g.weird_ref.da.0 +; CHECK-MACHO-NEXT: .p2align 3 +; CHECK-MACHO-NEXT: _g.weird_ref.da.0: 
+; CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) + +@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64) + +; CHECK-ELF-LABEL: .globl g_weak.ref.ia.42 +; CHECK-ELF-NEXT: .p2align 3 +; CHECK-ELF-NEXT: g_weak.ref.ia.42: +; CHECK-ELF-NEXT: .xword g_weak@AUTH(ia,42) + +; CHECK-MACHO-LABEL: .globl _g_weak.ref.ia.42 +; CHECK-MACHO-NEXT: .p2align 3 +; CHECK-MACHO-NEXT: _g_weak.ref.ia.42: +; CHECK-MACHO-NEXT: .quad _g_weak@AUTH(ia,42) + +@g_weak.ref.ia.42 = constant ptr ptrauth (ptr @g_weak, i32 0, i64 42) + +; CHECK-ELF-LABEL: .globl g_strong_def.ref.da.0 +; CHECK-ELF-NEXT: .p2align 3 +; CHECK-ELF-NEXT: g_strong_def.ref.da.0: +; CHECK-ELF-NEXT: .xword g_strong_def@AUTH(da,0) + +; CHECK-MACHO-LABEL: .globl _g_strong_def.ref.da.0 +; CHECK-MACHO-NEXT: .p2align 3 +; CHECK-MACHO-NEXT: _g_strong_def.ref.da.0: +; CHECK-MACHO-NEXT: .quad _g_strong_def@AUTH(da,0) + +@g_strong_def.ref.da.0 = constant ptr ptrauth (ptr @g_strong_def, i32 2) + +;--- err-key.ll + +; RUN: not --crash llc < err-key.ll -mtriple arm64e-apple-darwin 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERR-KEY +; RUN: not --crash llc < err-key.ll -mtriple aarch64-elf -mattr=+pauth 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERR-KEY + +; RUN: not --crash llc < err-key.ll -mtriple arm64e-apple-darwin \ +; RUN: -global-isel -verify-machineinstrs -global-isel-abort=1 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERR-KEY +; RUN: not --crash llc < err-key.ll -mtriple aarch64-elf -mattr=+pauth \ +; RUN: -global-isel -verify-machineinstrs -global-isel-abort=1 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERR-KEY + +; CHECK-ERR-KEY: LLVM ERROR: AArch64 PAC Key ID '4' out of range [0, 3] + +@g = external global i32 +@g.ref.4.0 = constant ptr ptrauth (ptr @g, i32 4, i64 0) + +;--- err-disc.ll + +; RUN: not --crash llc < err-disc.ll -mtriple arm64e-apple-darwin 2>&1 \ +; RUN: | FileCheck %s 
--check-prefix=CHECK-ERR-DISC +; RUN: not --crash llc < err-disc.ll -mtriple aarch64-elf -mattr=+pauth 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERR-DISC + +; RUN: not --crash llc < err-disc.ll -mtriple arm64e-apple-darwin \ +; RUN: -global-isel -verify-machineinstrs -global-isel-abort=1 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERR-DISC +; RUN: not --crash llc < err-disc.ll -mtriple aarch64-elf -mattr=+pauth \ +; RUN: -global-isel -verify-machineinstrs -global-isel-abort=1 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERR-DISC + +; CHECK-ERR-DISC: LLVM ERROR: AArch64 PAC Discriminator '65536' out of range [0, 0xFFFF] + +@g = external global i32 +@g.ref.ia.65536 = constant ptr ptrauth (ptr @g, i32 0, i64 65536) From 0bc33f41abf4174cb76b5099cffaf7820dec58e9 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Tue, 11 Jun 2024 06:30:13 +0200 Subject: [PATCH 17/82] =?UTF-8?q?[AMDGPU]=20Update=20tests=20for=20last-us?= =?UTF-8?q?e=20in=20global/scratch/flat/buffer=20load=E2=80=A6=20(#94975)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … instructions --- .../llvm.amdgcn.buffer.load-last-use.ll | 37 +++++++++++++++++++ .../AMDGPU/memory-legalizer-flat-lastuse.ll | 7 +--- .../AMDGPU/memory-legalizer-global-lastuse.ll | 7 +--- .../memory-legalizer-private-lastuse.ll | 7 +--- 4 files changed, 43 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll new file mode 100644 index 00000000000000..de484e3db18ab5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefix=GCN +;RUN: llc < %s 
-global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefix=GCN + +define float @raw_buffer_load(<4 x i32> inreg) { +; GCN-LABEL: raw_buffer_load: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: buffer_load_b32 v0, off, s[0:3], null th:TH_LOAD_LU +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: s_setpc_b64 s[30:31] +main_body: + %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 3) + ret float %data +} + +define float @struct_buffer_load(<4 x i32> inreg) { +; GCN-LABEL: struct_buffer_load: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_expcnt 0x0 +; GCN-NEXT: s_wait_samplecnt 0x0 +; GCN-NEXT: s_wait_bvhcnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_load_b32 v0, v0, s[0:3], null idxen th:TH_LOAD_LU +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: s_setpc_b64 s[30:31] +main_body: + %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 3) + ret float %data +} + diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index e7c6044b3fb6b0..fb40274cac1bac 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 
-mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: @@ -107,6 +107,3 @@ entry: !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12-CU: {{.*}} -; GFX12-WGP: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index c889c67a5ca37d..7a9cb992a0cd16 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_0: @@ -92,6 +92,3 @@ entry: } !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX12-CU: {{.*}} -; GFX12-WGP: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index 1f835349b12b0b..61cec731feb565 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_load_0: @@ -85,6 +85,3 @@ entry: !0 = !{i32 1} declare i32 @llvm.amdgcn.workitem.id.x() -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12-CU: {{.*}} -; GFX12-WGP: {{.*}} From 48f8130a49aad715ff6d5136dad2447d41e9537b Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov <6532716+alexander-shaposhnikov@users.noreply.github.com> Date: Mon, 10 Jun 2024 22:14:26 -0700 Subject: [PATCH 18/82] [Clang][Sanitizers] Add numerical sanitizer (#93783) Add plumbing for the numerical sanitizer on Clang's side. 
--- clang/include/clang/Basic/Features.def | 1 + clang/include/clang/Basic/Sanitizers.def | 3 ++ clang/include/clang/Driver/SanitizerArgs.h | 3 ++ clang/lib/CodeGen/CGDeclCXX.cpp | 4 +++ clang/lib/CodeGen/CodeGenFunction.cpp | 2 ++ clang/lib/Driver/SanitizerArgs.cpp | 7 ++-- clang/lib/Driver/ToolChains/Darwin.cpp | 1 + clang/lib/Driver/ToolChains/Linux.cpp | 3 ++ .../sanitize-numerical-stability-attr.cpp | 34 +++++++++++++++++++ clang/test/Driver/fsanitize.c | 15 ++++++++ ..._feature_numerical_stability_sanitizer.cpp | 11 ++++++ 11 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/sanitize-numerical-stability-attr.cpp create mode 100644 clang/test/Lexer/has_feature_numerical_stability_sanitizer.cpp diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def index b762e44e755ec4..53f410d3cb4bde 100644 --- a/clang/include/clang/Basic/Features.def +++ b/clang/include/clang/Basic/Features.def @@ -96,6 +96,7 @@ FEATURE(nullability, true) FEATURE(nullability_on_arrays, true) FEATURE(nullability_on_classes, true) FEATURE(nullability_nullable_result, true) +FEATURE(numerical_stability_sanitizer, LangOpts.Sanitize.has(SanitizerKind::NumericalStability)) FEATURE(memory_sanitizer, LangOpts.Sanitize.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory)) diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def index b228ffd07ee745..bee35e9dca7c39 100644 --- a/clang/include/clang/Basic/Sanitizers.def +++ b/clang/include/clang/Basic/Sanitizers.def @@ -76,6 +76,9 @@ SANITIZER("fuzzer-no-link", FuzzerNoLink) // ThreadSanitizer SANITIZER("thread", Thread) +// Numerical stability sanitizer. 
+SANITIZER("numerical", NumericalStability) + // LeakSanitizer SANITIZER("leak", Leak) diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h index 07070ec4fc0653..47ef175302679f 100644 --- a/clang/include/clang/Driver/SanitizerArgs.h +++ b/clang/include/clang/Driver/SanitizerArgs.h @@ -103,6 +103,9 @@ class SanitizerArgs { bool needsCfiDiagRt() const; bool needsStatsRt() const { return Stats; } bool needsScudoRt() const { return Sanitizers.has(SanitizerKind::Scudo); } + bool needsNsanRt() const { + return Sanitizers.has(SanitizerKind::NumericalStability); + } bool hasMemTag() const { return hasMemtagHeap() || hasMemtagStack() || hasMemtagGlobals(); diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index b047279912f6b7..a88bb2af59fee0 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -476,6 +476,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction( !isInNoSanitizeList(SanitizerKind::Thread, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeThread); + if (getLangOpts().Sanitize.has(SanitizerKind::NumericalStability) && + !isInNoSanitizeList(SanitizerKind::NumericalStability, Fn, Loc)) + Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability); + if (getLangOpts().Sanitize.has(SanitizerKind::Memory) && !isInNoSanitizeList(SanitizerKind::Memory, Fn, Loc)) Fn->addFnAttr(llvm::Attribute::SanitizeMemory); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index f84b3b08220fd9..cea0d84c64bc47 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -818,6 +818,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, Fn->addFnAttr(llvm::Attribute::SanitizeMemTag); if (SanOpts.has(SanitizerKind::Thread)) Fn->addFnAttr(llvm::Attribute::SanitizeThread); + if (SanOpts.has(SanitizerKind::NumericalStability)) + 
Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability); if (SanOpts.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory)) Fn->addFnAttr(llvm::Attribute::SanitizeMemory); } diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 273f215ca94a88..86825a6ccf7a1d 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -41,7 +41,8 @@ static const SanitizerMask NotAllowedWithExecuteOnly = SanitizerKind::Function | SanitizerKind::KCFI; static const SanitizerMask NeedsUnwindTables = SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::Thread | - SanitizerKind::Memory | SanitizerKind::DataFlow; + SanitizerKind::Memory | SanitizerKind::DataFlow | + SanitizerKind::NumericalStability; static const SanitizerMask SupportsCoverage = SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::KernelAddress | SanitizerKind::KernelHWAddress | @@ -53,7 +54,8 @@ static const SanitizerMask SupportsCoverage = SanitizerKind::DataFlow | SanitizerKind::Fuzzer | SanitizerKind::FuzzerNoLink | SanitizerKind::FloatDivideByZero | SanitizerKind::SafeStack | SanitizerKind::ShadowCallStack | - SanitizerKind::Thread | SanitizerKind::ObjCCast | SanitizerKind::KCFI; + SanitizerKind::Thread | SanitizerKind::ObjCCast | SanitizerKind::KCFI | + SanitizerKind::NumericalStability; static const SanitizerMask RecoverableByDefault = SanitizerKind::Undefined | SanitizerKind::Integer | SanitizerKind::ImplicitConversion | SanitizerKind::Nullability | @@ -175,6 +177,7 @@ static void addDefaultIgnorelists(const Driver &D, SanitizerMask Kinds, {"hwasan_ignorelist.txt", SanitizerKind::HWAddress}, {"memtag_ignorelist.txt", SanitizerKind::MemTag}, {"msan_ignorelist.txt", SanitizerKind::Memory}, + {"nsan_ignorelist.txt", SanitizerKind::NumericalStability}, {"tsan_ignorelist.txt", SanitizerKind::Thread}, {"dfsan_abilist.txt", SanitizerKind::DataFlow}, {"cfi_ignorelist.txt", SanitizerKind::CFI}, diff --git 
a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 593b403a1e3f05..ed5737915aa96b 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -3448,6 +3448,7 @@ SanitizerMask Darwin::getSupportedSanitizers() const { Res |= SanitizerKind::PointerCompare; Res |= SanitizerKind::PointerSubtract; Res |= SanitizerKind::Leak; + Res |= SanitizerKind::NumericalStability; Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; Res |= SanitizerKind::ObjCCast; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index db2c20d7b461d0..2c583ac724a2a2 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -826,6 +826,9 @@ SanitizerMask Linux::getSupportedSanitizers() const { if (IsX86_64 || IsAArch64) { Res |= SanitizerKind::KernelHWAddress; } + if (IsX86_64 || IsAArch64) + Res |= SanitizerKind::NumericalStability; + // Work around "Cannot represent a difference across sections". 
if (getTriple().getArch() == llvm::Triple::ppc64) Res &= ~SanitizerKind::Function; diff --git a/clang/test/CodeGen/sanitize-numerical-stability-attr.cpp b/clang/test/CodeGen/sanitize-numerical-stability-attr.cpp new file mode 100644 index 00000000000000..f51fb79bda6afd --- /dev/null +++ b/clang/test/CodeGen/sanitize-numerical-stability-attr.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck -check-prefix=WITHOUT %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s -fsanitize=numerical | FileCheck -check-prefix=NSAN %s +// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t +// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s -fsanitize=numerical -fsanitize-ignorelist=%t | FileCheck -check-prefix=BL %s + +// WITHOUT: NoNSAN3{{.*}}) [[NOATTR:#[0-9]+]] +// BL: NoNSAN3{{.*}}) [[NOATTR:#[0-9]+]] +// NSAN: NoNSAN3{{.*}}) [[NOATTR:#[0-9]+]] +__attribute__((no_sanitize("numerical"))) +int NoNSAN3(int *a) { return *a; } + +// WITHOUT: NSANOk{{.*}}) [[NOATTR]] +// BL: NSANOk{{.*}}) [[NOATTR]] +// NSAN: NSANOk{{.*}}) [[WITH:#[0-9]+]] +int NSANOk(int *a) { return *a; } + +// WITHOUT: TemplateNSANOk{{.*}}) [[NOATTR]] +// BL: TemplateNSANOk{{.*}}) [[NOATTR]] +// NSAN: TemplateNSANOk{{.*}}) [[WITH]] +template +int TemplateNSANOk() { return i; } + +// WITHOUT: TemplateNoNSAN{{.*}}) [[NOATTR]] +// BL: TemplateNoNSAN{{.*}}) [[NOATTR]] +// NSAN: TemplateNoNSAN{{.*}}) [[NOATTR]] +template +__attribute__((no_sanitize("numerical"))) +int TemplateNoNSAN() { return i; } + +int force_instance = TemplateNSANOk<42>() + TemplateNoNSAN<42>(); + +// WITHOUT: attributes [[NOATTR]] = { mustprogress noinline nounwind{{.*}} } +// BL: attributes [[NOATTR]] = { mustprogress noinline nounwind{{.*}} } +// NSAN: attributes [[WITH]] = { mustprogress noinline nounwind optnone sanitize_numerical_stability{{.*}} } diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c index 571f79a6e7f70d..ba64b3dcb11aa5 100644 
--- a/clang/test/Driver/fsanitize.c +++ b/clang/test/Driver/fsanitize.c @@ -459,6 +459,21 @@ // CHECK-TSAN-MSAN-MSAN-DARWIN: unsupported option '-fsanitize=memory' for target 'x86_64-apple-darwin10' // CHECK-TSAN-MSAN-MSAN-DARWIN-NOT: unsupported option +// RUN: %clang --target=x86_64-linux-gnu -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-X86-64-LINUX +// CHECK-NSAN-X86-64-LINUX: "-fsanitize=numerical" + +// RUN: %clang --target=aarch64-unknown-linux-gnu -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-AARCH64-LINUX +// CHECK-NSAN-AARCH64-LINUX: "-fsanitize=numerical" + +// RUN: not %clang --target=mips-unknown-linux -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-MIPS-LINUX +// CHECK-NSAN-MIPS-LINUX: error: unsupported option '-fsanitize=numerical' for target 'mips-unknown-linux' + +// RUN: %clang --target=x86_64-apple-macos -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-X86-64-MACOS +// CHECK-NSAN-X86-64-MACOS: "-fsanitize=numerical" + +// RUN: %clang --target=arm64-apple-macos -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-ARM64-MACOS +// CHECK-NSAN-ARM64-MACOS: "-fsanitize=numerical" + // RUN: %clang --target=x86_64-apple-darwin -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-DARWIN // CHECK-TSAN-X86-64-DARWIN-NOT: unsupported option // RUN: %clang --target=x86_64-apple-macos -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-MACOS diff --git a/clang/test/Lexer/has_feature_numerical_stability_sanitizer.cpp b/clang/test/Lexer/has_feature_numerical_stability_sanitizer.cpp new file mode 100644 index 00000000000000..78884977322b8e --- /dev/null +++ b/clang/test/Lexer/has_feature_numerical_stability_sanitizer.cpp @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -E -fsanitize=numerical %s -o - | FileCheck --check-prefix=CHECK-NSAN %s +// RUN: %clang_cc1 -E %s -o - | FileCheck 
--check-prefix=CHECK-NO-NSAN %s + +#if __has_feature(numerical_stability_sanitizer) +int NumericalStabilitySanitizerEnabled(); +#else +int NumericalStabilitySanitizerDisabled(); +#endif + +// CHECK-NSAN: NumericalStabilitySanitizerEnabled +// CHECK-NO-NSAN: NumericalStabilitySanitizerDisabled From 41c650e8208f7804eb5ecd8749d6b31b6e518bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 11 Jun 2024 09:03:41 +0300 Subject: [PATCH 19/82] [clang] Don't use -Wno-nested-anon-types on GCC (#95029) GCC usually doesn't warn about unrecognized -Wno- options, if no diagnostics are printed. However if some diagnostics are printed, it also mentions that there were unrecognized -Wno- options. Before 4feae05c6abda364a9295aecfa600d7d4e7dfeb6, we checked for whether -Wnested-anon-types was supported, and added the -Wno- form if the positive form of the option was supported. As of GCC 14, -Wnested-anon-types isn't supported, thus limit the use of the option to actual Clang (and still only while using the GCC compatible driver). This avoids unnecessary mentions about unrecognized -Wno- options when building with GCC. --- clang/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index f6d96258cd135b..c6496167d3828b 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -350,7 +350,9 @@ if (LLVM_COMPILER_IS_GCC_COMPATIBLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wno-long-long") endif () - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" ) + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" ) + endif () endif () # Determine HOST_LINK_VERSION on Darwin. 
From 607afa0b6375e4837fef298a798f5534e783d777 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 10 Jun 2024 23:06:06 -0700 Subject: [PATCH 20/82] Revert "[llvm][IR] Extend BranchWeightMetadata to track provenance of weights" (#95060) Reverts llvm/llvm-project#86609 This change causes compile-time regressions for stage2 builds (https://llvm-compile-time-tracker.com/compare.php?from=3254f31a66263ea9647c9547f1531c3123444fcd&to=c5978f1eb5eeca8610b9dfce1fcbf1f473911cd8&stat=instructions:u). It also introduced unintended changes to `.text` which should be addressed before relanding. --- .../attr-likelihood-if-vs-builtin-expect.cpp | 4 +- llvm/docs/BranchWeightMetadata.rst | 7 ---- llvm/include/llvm/IR/MDBuilder.h | 11 +---- llvm/include/llvm/IR/ProfDataUtils.h | 17 +------- llvm/lib/CodeGen/CodeGenPrepare.cpp | 3 +- llvm/lib/IR/Instruction.cpp | 19 ++------- llvm/lib/IR/Instructions.cpp | 6 +-- llvm/lib/IR/MDBuilder.cpp | 14 +++---- llvm/lib/IR/Metadata.cpp | 8 ++-- llvm/lib/IR/ProfDataUtils.cpp | 40 +++++-------------- llvm/lib/IR/Verifier.cpp | 9 ++--- llvm/lib/Transforms/IPO/SampleProfile.cpp | 7 ++-- .../ControlHeightReduction.cpp | 2 +- .../Instrumentation/IndirectCallPromotion.cpp | 3 +- .../Instrumentation/PGOInstrumentation.cpp | 5 +-- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 4 +- .../Scalar/LowerExpectIntrinsic.cpp | 16 ++++---- llvm/lib/Transforms/Utils/Local.cpp | 2 +- llvm/lib/Transforms/Utils/LoopPeel.cpp | 4 +- .../Transforms/Utils/LoopRotationUtils.cpp | 4 +- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 23 +++++------ .../Transforms/Vectorize/LoopVectorize.cpp | 12 +++--- .../Transforms/LowerExpectIntrinsic/basic.ll | 8 ++-- .../expect-with-probability.ll | 8 ++-- .../LowerExpectIntrinsic/expect_nonboolean.ll | 5 ++- .../LowerExpectIntrinsic/phi_merge.ll | 4 +- .../Transforms/LowerExpectIntrinsic/phi_or.ll | 4 +- .../LowerExpectIntrinsic/phi_tern.ll | 2 +- .../LowerExpectIntrinsic/phi_unexpect.ll | 4 +- .../AArch64/hoist-runtime-checks.ll | 2 +- 30 
files changed, 88 insertions(+), 169 deletions(-) diff --git a/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp b/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp index 81d93343565209..fb236aeb982e01 100644 --- a/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp +++ b/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp @@ -221,5 +221,5 @@ void tu2(int &i) { } } -// CHECK: [[BW_LIKELY]] = !{!"branch_weights", !"expected", i32 2000, i32 1} -// CHECK: [[BW_UNLIKELY]] = !{!"branch_weights", !"expected", i32 1, i32 2000} +// CHECK: [[BW_LIKELY]] = !{!"branch_weights", i32 2000, i32 1} +// CHECK: [[BW_UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000} diff --git a/llvm/docs/BranchWeightMetadata.rst b/llvm/docs/BranchWeightMetadata.rst index 62204753e29b06..522f37cdad4fc1 100644 --- a/llvm/docs/BranchWeightMetadata.rst +++ b/llvm/docs/BranchWeightMetadata.rst @@ -28,14 +28,11 @@ Supported Instructions Metadata is only assigned to the conditional branches. There are two extra operands for the true and the false branch. -We optionally track if the metadata was added by ``__builtin_expect`` or -``__builtin_expect_with_probability`` with an optional field ``!"expected"``. .. code-block:: none !0 = !{ !"branch_weights", - [ !"expected", ] i32 , i32 } @@ -50,7 +47,6 @@ is always case #0). !0 = !{ !"branch_weights", - [ !"expected", ] i32 [ , i32 ... ] } @@ -64,7 +60,6 @@ Branch weights are assigned to every destination. !0 = !{ !"branch_weights", - [ !"expected", ] i32 [ , i32 ... ] } @@ -80,7 +75,6 @@ block and entry counts which may not be accurate with sampling. !0 = !{ !"branch_weights", - [ !"expected", ] i32 } @@ -101,7 +95,6 @@ is used. 
!0 = !{ !"branch_weights", - [ !"expected", ] i32 [ , i32 ] } diff --git a/llvm/include/llvm/IR/MDBuilder.h b/llvm/include/llvm/IR/MDBuilder.h index e02ec8f5a3d8bb..3265589b7c8dfa 100644 --- a/llvm/include/llvm/IR/MDBuilder.h +++ b/llvm/include/llvm/IR/MDBuilder.h @@ -59,11 +59,7 @@ class MDBuilder { //===------------------------------------------------------------------===// /// Return metadata containing two branch weights. - /// @param TrueWeight the weight of the true branch - /// @param FalseWeight the weight of the false branch - /// @param Do these weights come from __builtin_expect* - MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, - bool IsExpected = false); + MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight); /// Return metadata containing two branch weights, with significant bias /// towards `true` destination. @@ -74,10 +70,7 @@ class MDBuilder { MDNode *createUnlikelyBranchWeights(); /// Return metadata containing a number of branch weights. - /// @param Weights the weights of all the branches - /// @param Do these weights come from __builtin_expect* - MDNode *createBranchWeights(ArrayRef Weights, - bool IsExpected = false); + MDNode *createBranchWeights(ArrayRef Weights); /// Return metadata specifying that a branch or switch is unpredictable. MDNode *createUnpredictable(); diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index 1d7c97d9be953e..88fbad4d6b9d82 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -55,17 +55,6 @@ MDNode *getBranchWeightMDNode(const Instruction &I); /// Nullptr otherwise. 
MDNode *getValidBranchWeightMDNode(const Instruction &I); -/// Check if Branch Weight Metadata has an "expected" field from an llvm.expect* -/// intrinsic -bool hasBranchWeightOrigin(const Instruction &I); - -/// Check if Branch Weight Metadata has an "expected" field from an llvm.expect* -/// intrinsic -bool hasBranchWeightOrigin(const MDNode *ProfileData); - -/// Return the offset to the first branch weight data -unsigned getBranchWeightOffset(const MDNode *ProfileData); - /// Extract branch weights from MD_prof metadata /// /// \param ProfileData A pointer to an MDNode. @@ -122,11 +111,7 @@ bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalWeights); /// Create a new `branch_weights` metadata node and add or overwrite /// a `prof` metadata reference to instruction `I`. -/// \param I the Instruction to set branch weights on. -/// \param Weights an array of weights to set on instruction I. -/// \param IsExpected were these weights added from an llvm.expect* intrinsic. -void setBranchWeights(Instruction &I, ArrayRef Weights, - bool IsExpected); +void setBranchWeights(Instruction &I, ArrayRef Weights); /// Scaling the profile data attached to 'I' using the ratio of S/T. 
void scaleProfData(Instruction &I, uint64_t S, uint64_t T); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 0e01080bd75cc9..339a1f1f2f002e 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -8866,8 +8866,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) { scaleWeights(NewTrueWeight, NewFalseWeight); Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext()) - .createBranchWeights(TrueWeight, FalseWeight, - hasBranchWeightOrigin(*Br1))); + .createBranchWeights(TrueWeight, FalseWeight)); NewTrueWeight = TrueWeight; NewFalseWeight = 2 * FalseWeight; diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index aec927a8cf31d9..29272e627a1d1d 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -1268,23 +1268,12 @@ Instruction *Instruction::cloneImpl() const { void Instruction::swapProfMetadata() { MDNode *ProfileData = getBranchWeightMDNode(*this); - if (!ProfileData) - return; - unsigned FirstIdx = getBranchWeightOffset(ProfileData); - if (ProfileData->getNumOperands() != 2 + FirstIdx) + if (!ProfileData || ProfileData->getNumOperands() != 3) return; - unsigned SecondIdx = FirstIdx + 1; - SmallVector Ops; - // If there are more weights past the second, we can't swap them - if (ProfileData->getNumOperands() > SecondIdx + 1) - return; - for (unsigned Idx = 0; Idx < FirstIdx; ++Idx) { - Ops.push_back(ProfileData->getOperand(Idx)); - } - // Switch the order of the weights - Ops.push_back(ProfileData->getOperand(SecondIdx)); - Ops.push_back(ProfileData->getOperand(FirstIdx)); + // The first operand is the name. Fetch them backwards and build a new one. 
+ Metadata *Ops[] = {ProfileData->getOperand(0), ProfileData->getOperand(2), + ProfileData->getOperand(1)}; setMetadata(LLVMContext::MD_prof, MDNode::get(ProfileData->getContext(), Ops)); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index de369bd62a6179..1213f078d05eca 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -5199,11 +5199,7 @@ void SwitchInstProfUpdateWrapper::init() { if (!ProfileData) return; - // FIXME: This check belongs in ProfDataUtils. Its almost equivalent to - // getValidBranchWeightMDNode(), but the need to use llvm_unreachable - // makes them slightly different. - if (ProfileData->getNumOperands() != - SI.getNumSuccessors() + getBranchWeightOffset(ProfileData)) { + if (ProfileData->getNumOperands() != SI.getNumSuccessors() + 1) { llvm_unreachable("number of prof branch_weights metadata operands does " "not correspond to number of succesors"); } diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp index 000027754d13ec..bd68db3a6f9616 100644 --- a/llvm/lib/IR/MDBuilder.cpp +++ b/llvm/lib/IR/MDBuilder.cpp @@ -35,8 +35,8 @@ MDNode *MDBuilder::createFPMath(float Accuracy) { } MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight, - uint32_t FalseWeight, bool IsExpected) { - return createBranchWeights({TrueWeight, FalseWeight}, IsExpected); + uint32_t FalseWeight) { + return createBranchWeights({TrueWeight, FalseWeight}); } MDNode *MDBuilder::createLikelyBranchWeights() { @@ -49,19 +49,15 @@ MDNode *MDBuilder::createUnlikelyBranchWeights() { return createBranchWeights(1, (1U << 20) - 1); } -MDNode *MDBuilder::createBranchWeights(ArrayRef Weights, - bool IsExpected) { +MDNode *MDBuilder::createBranchWeights(ArrayRef Weights) { assert(Weights.size() >= 1 && "Need at least one branch weights!"); - unsigned int Offset = IsExpected ? 
2 : 1; - SmallVector Vals(Weights.size() + Offset); + SmallVector Vals(Weights.size() + 1); Vals[0] = createString("branch_weights"); - if (IsExpected) - Vals[1] = createString("expected"); Type *Int32Ty = Type::getInt32Ty(Context); for (unsigned i = 0, e = Weights.size(); i != e; ++i) - Vals[i + Offset] = createConstant(ConstantInt::get(Int32Ty, Weights[i])); + Vals[i + 1] = createConstant(ConstantInt::get(Int32Ty, Weights[i])); return MDNode::get(Context, Vals); } diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 5f42ce22f72fec..b6c932495a145d 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1196,10 +1196,10 @@ MDNode *MDNode::mergeDirectCallProfMetadata(MDNode *A, MDNode *B, StringRef AProfName = AMDS->getString(); StringRef BProfName = BMDS->getString(); if (AProfName == "branch_weights" && BProfName == "branch_weights") { - ConstantInt *AInstrWeight = mdconst::dyn_extract( - A->getOperand(getBranchWeightOffset(A))); - ConstantInt *BInstrWeight = mdconst::dyn_extract( - B->getOperand(getBranchWeightOffset(B))); + ConstantInt *AInstrWeight = + mdconst::dyn_extract(A->getOperand(1)); + ConstantInt *BInstrWeight = + mdconst::dyn_extract(B->getOperand(1)); assert(AInstrWeight && BInstrWeight && "verified by LLVM verifier"); return MDNode::get(Ctx, {MDHelper.createString("branch_weights"), diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index c4b1ed55de8a22..51e78dc5e6c008 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -40,6 +40,9 @@ namespace { // We maintain some constants here to ensure that we access the branch weights // correctly, and can change the behavior in the future if the layout changes +// The index at which the weights vector starts +constexpr unsigned WeightsIdx = 1; + // the minimum number of operands for MD_prof nodes with branch weights constexpr unsigned MinBWOps = 3; @@ -72,7 +75,6 @@ static void extractFromBranchWeightMD(const MDNode 
*ProfileData, assert(isBranchWeightMD(ProfileData) && "wrong metadata"); unsigned NOps = ProfileData->getNumOperands(); - unsigned WeightsIdx = getBranchWeightOffset(ProfileData); assert(WeightsIdx < NOps && "Weights Index must be less than NOps."); Weights.resize(NOps - WeightsIdx); @@ -80,8 +82,8 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData, ConstantInt *Weight = mdconst::dyn_extract(ProfileData->getOperand(Idx)); assert(Weight && "Malformed branch_weight in MD_prof node"); - assert(Weight->getValue().getActiveBits() <= (sizeof(T) * 8) && - "Too many bits for MD_prof branch_weight"); + assert(Weight->getValue().getActiveBits() <= 32 && + "Too many bits for uint32_t"); Weights[Idx - WeightsIdx] = Weight->getZExtValue(); } } @@ -121,26 +123,6 @@ bool hasValidBranchWeightMD(const Instruction &I) { return getValidBranchWeightMDNode(I); } -bool hasBranchWeightOrigin(const Instruction &I) { - auto *ProfileData = I.getMetadata(LLVMContext::MD_prof); - return hasBranchWeightOrigin(ProfileData); -} - -bool hasBranchWeightOrigin(const MDNode *ProfileData) { - if (!isBranchWeightMD(ProfileData)) - return false; - auto *ProfDataName = dyn_cast(ProfileData->getOperand(1)); - // NOTE: if we ever have more types of branch weight provenance, - // we need to check the string value is "expected". For now, we - // supply a more generic API, and avoid the spurious comparisons. - assert(ProfDataName == nullptr || ProfDataName->getString() == "expected"); - return ProfDataName != nullptr; -} - -unsigned getBranchWeightOffset(const MDNode *ProfileData) { - return hasBranchWeightOrigin(ProfileData) ? 
2 : 1; -} - MDNode *getBranchWeightMDNode(const Instruction &I) { auto *ProfileData = I.getMetadata(LLVMContext::MD_prof); if (!isBranchWeightMD(ProfileData)) @@ -150,9 +132,7 @@ MDNode *getBranchWeightMDNode(const Instruction &I) { MDNode *getValidBranchWeightMDNode(const Instruction &I) { auto *ProfileData = getBranchWeightMDNode(I); - auto Offset = getBranchWeightOffset(ProfileData); - if (ProfileData && - ProfileData->getNumOperands() == Offset + I.getNumSuccessors()) + if (ProfileData && ProfileData->getNumOperands() == 1 + I.getNumSuccessors()) return ProfileData; return nullptr; } @@ -211,8 +191,7 @@ bool extractProfTotalWeight(const MDNode *ProfileData, uint64_t &TotalVal) { return false; if (ProfDataName->getString() == "branch_weights") { - unsigned Offset = getBranchWeightOffset(ProfileData); - for (unsigned Idx = Offset; Idx < ProfileData->getNumOperands(); ++Idx) { + for (unsigned Idx = 1; Idx < ProfileData->getNumOperands(); Idx++) { auto *V = mdconst::dyn_extract(ProfileData->getOperand(Idx)); assert(V && "Malformed branch_weight in MD_prof node"); TotalVal += V->getValue().getZExtValue(); @@ -233,10 +212,9 @@ bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalVal) { return extractProfTotalWeight(I.getMetadata(LLVMContext::MD_prof), TotalVal); } -void setBranchWeights(Instruction &I, ArrayRef Weights, - bool IsExpected) { +void setBranchWeights(Instruction &I, ArrayRef Weights) { MDBuilder MDB(I.getContext()); - MDNode *BranchWeights = MDB.createBranchWeights(Weights, IsExpected); + MDNode *BranchWeights = MDB.createBranchWeights(Weights); I.setMetadata(LLVMContext::MD_prof, BranchWeights); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index fe2253dd04eb35..e5927203f33a20 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -104,7 +104,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/ProfDataUtils.h" #include 
"llvm/IR/Statepoint.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -4809,10 +4808,8 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) { // Check consistency of !prof branch_weights metadata. if (ProfName == "branch_weights") { - unsigned int Offset = getBranchWeightOffset(MD); if (isa(&I)) { - Check(MD->getNumOperands() == (1 + Offset) || - MD->getNumOperands() == (2 + Offset), + Check(MD->getNumOperands() == 2 || MD->getNumOperands() == 3, "Wrong number of InvokeInst branch_weights operands", MD); } else { unsigned ExpectedNumOperands = 0; @@ -4832,10 +4829,10 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) { CheckFailed("!prof branch_weights are not allowed for this instruction", MD); - Check(MD->getNumOperands() == Offset + ExpectedNumOperands, + Check(MD->getNumOperands() == 1 + ExpectedNumOperands, "Wrong number of operands", MD); } - for (unsigned i = Offset; i < MD->getNumOperands(); ++i) { + for (unsigned i = 1; i < MD->getNumOperands(); ++i) { auto &MDO = MD->getOperand(i); Check(MDO, "second operand should not be null", MD); Check(mdconst::dyn_extract(MDO), diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 7e6a8817b7a67a..92ad4c34da6e7e 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1662,8 +1662,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { else if (OverwriteExistingWeights) I.setMetadata(LLVMContext::MD_prof, nullptr); } else if (!isa(&I)) { - setBranchWeights(I, {static_cast(BlockWeights[BB])}, - /*IsExpected=*/false); + setBranchWeights(I, {static_cast(BlockWeights[BB])}); } } } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { @@ -1674,7 +1673,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { if (cast(I).isIndirectCall()) { I.setMetadata(LLVMContext::MD_prof, nullptr); } else { - setBranchWeights(I, {uint32_t(0)}, /*IsExpected=*/false); + 
setBranchWeights(I, {uint32_t(0)}); } } } @@ -1757,7 +1756,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { if (MaxWeight > 0 && (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) { LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); - setBranchWeights(*TI, Weights, /*IsExpected=*/false); + setBranchWeights(*TI, Weights); ORE->emit([&]() { return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst) << "most popular destination for conditional branches at " diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 731104d4fcef02..0a3d8d6000cf47 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1878,7 +1878,7 @@ void CHR::fixupBranchesAndSelects(CHRScope *Scope, static_cast(CHRBranchBias.scale(1000)), static_cast(CHRBranchBias.getCompl().scale(1000)), }; - setBranchWeights(*MergedBR, Weights, /*IsExpected=*/false); + setBranchWeights(*MergedBR, Weights); CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1] << "\n"); } diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 6db76ca78b218a..23a7c6a20aecbc 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -259,8 +259,7 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee, promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights); if (AttachProfToDirectCall) { - setBranchWeights(NewInst, {static_cast(Count)}, - /*IsExpected=*/false); + setBranchWeights(NewInst, {static_cast(Count)}); } using namespace ore; diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 
ac6d3348b3db9c..2269c2e0fffae9 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1474,8 +1474,7 @@ void PGOUseFunc::populateCoverage(IndexedInstrProfReader *PGOReader) { for (auto *Succ : successors(&BB)) Weights.push_back((Coverage[Succ] || !Coverage[&BB]) ? 1 : 0); if (Weights.size() >= 2) - llvm::setBranchWeights(*BB.getTerminator(), Weights, - /*IsExpected=*/false); + llvm::setBranchWeights(*BB.getTerminator(), Weights); } unsigned NumCorruptCoverage = 0; @@ -2261,7 +2260,7 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false); - setBranchWeights(*TI, Weights, /*IsExpected=*/false); + setBranchWeights(*TI, Weights); if (EmitBranchProbability) { std::string BrCondStr = getBranchCondString(TI); if (BrCondStr.empty()) diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b9583836aea065..74a8f1958dfe93 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -231,7 +231,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { Weights[0] = BP.getCompl().getNumerator(); Weights[1] = BP.getNumerator(); } - setBranchWeights(*PredBr, Weights, hasBranchWeightOrigin(*PredBr)); + setBranchWeights(*PredBr, Weights); } } @@ -2618,7 +2618,7 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB, Weights.push_back(Prob.getNumerator()); auto TI = BB->getTerminator(); - setBranchWeights(*TI, Weights, hasBranchWeightOrigin(*TI)); + setBranchWeights(*TI, Weights); } } diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 17c5a4ee1fd0be..6f87e4d91d2c79 100644 --- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -102,7 +102,7 @@ 
static bool handleSwitchExpect(SwitchInst &SI) { misexpect::checkExpectAnnotations(SI, Weights, /*IsFrontend=*/true); SI.setCondition(ArgValue); - setBranchWeights(SI, Weights, /*IsExpected=*/true); + setBranchWeights(SI, Weights); return true; } @@ -262,13 +262,11 @@ static void handlePhiDef(CallInst *Expect) { if (IsOpndComingFromSuccessor(BI->getSuccessor(1))) BI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(LikelyBranchWeightVal, - UnlikelyBranchWeightVal, - /*IsExpected=*/true)); + UnlikelyBranchWeightVal)); else if (IsOpndComingFromSuccessor(BI->getSuccessor(0))) BI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(UnlikelyBranchWeightVal, - LikelyBranchWeightVal, - /*IsExpected=*/true)); + LikelyBranchWeightVal)); } } @@ -333,12 +331,12 @@ template static bool handleBrSelExpect(BrSelInst &BSI) { SmallVector ExpectedWeights; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == (Predicate == CmpInst::ICMP_EQ)) { - Node = MDB.createBranchWeights( - LikelyBranchWeightVal, UnlikelyBranchWeightVal, /*IsExpected=*/true); + Node = + MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal); ExpectedWeights = {LikelyBranchWeightVal, UnlikelyBranchWeightVal}; } else { - Node = MDB.createBranchWeights(UnlikelyBranchWeightVal, - LikelyBranchWeightVal, /*IsExpected=*/true); + Node = + MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal); ExpectedWeights = {UnlikelyBranchWeightVal, LikelyBranchWeightVal}; } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 12229123675e79..ce0f4c7668a40e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -231,7 +231,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, // Remove weight for this case. 
std::swap(Weights[Idx + 1], Weights.back()); Weights.pop_back(); - setBranchWeights(*SI, Weights, hasBranchWeightOrigin(MD)); + setBranchWeights(*SI, Weights); } // Remove this entry. BasicBlock *ParentBB = SI->getParent(); diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index d517ec3d6e9f21..e2516930d251b9 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -680,7 +680,7 @@ struct WeightInfo { /// To avoid dealing with division rounding we can just multiple both part /// of weights to E and use weight as (F - I * E, E). static void updateBranchWeights(Instruction *Term, WeightInfo &Info) { - setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false); + setBranchWeights(*Term, Info.Weights); for (auto [Idx, SubWeight] : enumerate(Info.SubWeights)) if (SubWeight != 0) // Don't set the probability of taking the edge from latch to loop header @@ -1073,7 +1073,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, } for (const auto &[Term, Info] : Weights) { - setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false); + setBranchWeights(*Term, Info.Weights); } // Update Metadata for count of peeled off iterations. diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 04042e71a2b82e..3d950b151cd32f 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -390,13 +390,13 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI, SuccsSwapped ? LoopBackWeight : ExitWeight1, SuccsSwapped ? ExitWeight1 : LoopBackWeight, }; - setBranchWeights(LoopBI, LoopBIWeights, /*IsExpected=*/false); + setBranchWeights(LoopBI, LoopBIWeights); if (HasConditionalPreHeader) { const uint32_t PreHeaderBIWeights[] = { SuccsSwapped ? EnterWeight : ExitWeight0, SuccsSwapped ? 
ExitWeight0 : EnterWeight, }; - setBranchWeights(PreHeaderBI, PreHeaderBIWeights, /*IsExpected=*/false); + setBranchWeights(PreHeaderBI, PreHeaderBIWeights); } } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 107c8bb6c027fa..292739b6c5fdab 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -861,28 +861,26 @@ static bool ValuesOverlap(std::vector &C1, // Set branch weights on SwitchInst. This sets the metadata if there is at // least one non-zero weight. -static void setBranchWeights(SwitchInst *SI, ArrayRef Weights, - bool IsExpected) { +static void setBranchWeights(SwitchInst *SI, ArrayRef Weights) { // Check that there is at least one non-zero weight. Otherwise, pass // nullptr to setMetadata which will erase the existing metadata. MDNode *N = nullptr; if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; })) - N = MDBuilder(SI->getParent()->getContext()) - .createBranchWeights(Weights, IsExpected); + N = MDBuilder(SI->getParent()->getContext()).createBranchWeights(Weights); SI->setMetadata(LLVMContext::MD_prof, N); } // Similar to the above, but for branch and select instructions that take // exactly 2 weights. static void setBranchWeights(Instruction *I, uint32_t TrueWeight, - uint32_t FalseWeight, bool IsExpected) { + uint32_t FalseWeight) { assert(isa(I) || isa(I)); // Check that there is at least one non-zero weight. Otherwise, pass // nullptr to setMetadata which will erase the existing metadata. 
MDNode *N = nullptr; if (TrueWeight || FalseWeight) N = MDBuilder(I->getParent()->getContext()) - .createBranchWeights(TrueWeight, FalseWeight, IsExpected); + .createBranchWeights(TrueWeight, FalseWeight); I->setMetadata(LLVMContext::MD_prof, N); } @@ -1340,7 +1338,7 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding( SmallVector MDWeights(Weights.begin(), Weights.end()); - setBranchWeights(NewSI, MDWeights, /*IsExpected=*/false); + setBranchWeights(NewSI, MDWeights); } EraseTerminatorAndDCECond(PTI); @@ -3833,7 +3831,7 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, FitWeights(NewWeights); SmallVector MDWeights(NewWeights.begin(), NewWeights.end()); - setBranchWeights(PBI, MDWeights[0], MDWeights[1], /*IsExpected=*/false); + setBranchWeights(PBI, MDWeights[0], MDWeights[1]); // TODO: If BB is reachable from all paths through PredBlock, then we // could replace PBI's branch probabilities with BI's. @@ -4570,7 +4568,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // Halve the weights if any of them cannot fit in an uint32_t FitWeights(NewWeights); - setBranchWeights(PBI, NewWeights[0], NewWeights[1], /*IsExpected=*/false); + setBranchWeights(PBI, NewWeights[0], NewWeights[1]); } // OtherDest may have phi nodes. If so, add an entry from PBI's @@ -4606,8 +4604,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, FitWeights(NewWeights); - setBranchWeights(NV, NewWeights[0], NewWeights[1], - /*IsExpected=*/false); + setBranchWeights(NV, NewWeights[0], NewWeights[1]); } } } @@ -4670,7 +4667,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm, // Create a conditional branch sharing the condition of the select. 
BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB); if (TrueWeight != FalseWeight) - setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false); + setBranchWeights(NewBI, TrueWeight, FalseWeight); } } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) { // Neither of the selected blocks were successors, so this @@ -5620,7 +5617,7 @@ bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI, TrueWeight /= 2; FalseWeight /= 2; } - setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false); + setBranchWeights(NewBI, TrueWeight, FalseWeight); } } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1acecf2738b5f2..c7c19ef456c7cb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2145,7 +2145,7 @@ class GeneratedRTChecks { BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond); if (AddBranchWeights) - setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); + setBranchWeights(BI, SCEVCheckBypassWeights); ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); return SCEVCheckBlock; } @@ -2173,7 +2173,7 @@ class GeneratedRTChecks { BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); if (AddBranchWeights) { - setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false); + setBranchWeights(BI, MemCheckBypassWeights); } ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI); MemCheckBlock->getTerminator()->setDebugLoc( @@ -2889,7 +2889,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) - setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); + setBranchWeights(BI, MinItersBypassWeights); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); 
LoopBypassBlocks.push_back(TCCheckBlock); } @@ -3128,7 +3128,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { unsigned TripCount = UF * VF.getKnownMinValue(); assert(TripCount > 0 && "trip count should not be zero"); const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(BI, Weights, /*IsExpected=*/false); + setBranchWeights(BI, Weights); } } @@ -7669,7 +7669,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) - setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); + setBranchWeights(BI, MinItersBypassWeights); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); return TCCheckBlock; @@ -7826,7 +7826,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); const uint32_t Weights[] = {EstimatedSkipCount, MainLoopStep - EstimatedSkipCount}; - setBranchWeights(BI, Weights, /*IsExpected=*/false); + setBranchWeights(BI, Weights); } ReplaceInstWithInst(Insert->getTerminator(), &BI); diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll b/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll index 8e06cd57a10014..0abca5b383224a 100644 --- a/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll +++ b/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll @@ -284,7 +284,7 @@ define i32 @test10(i64 %t6) { declare i1 @llvm.expect.i1(i1, i1) nounwind readnone -; CHECK: !0 = !{!"branch_weights", !"expected", i32 2000, i32 1} -; CHECK: !1 = !{!"branch_weights", !"expected", i32 1, i32 2000} -; CHECK: !2 = !{!"branch_weights", !"expected", i32 1, i32 1, i32 2000} -; CHECK: !3 = !{!"branch_weights", !"expected", i32 2000, i32 1, i32 1} +; CHECK: !0 = !{!"branch_weights", i32 2000, i32 1} +; CHECK: !1 = !{!"branch_weights", i32 1, i32 2000} +; CHECK: !2 = !{!"branch_weights", 
i32 1, i32 1, i32 2000} +; CHECK: !3 = !{!"branch_weights", i32 2000, i32 1, i32 1} diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll b/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll index 40571278ca93f3..64293557050c12 100644 --- a/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll +++ b/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll @@ -284,7 +284,7 @@ define i32 @test10(i64 %t6) { declare i1 @llvm.expect.with.probability.i1(i1, i1, double) nounwind readnone -; CHECK: !0 = !{!"branch_weights", !"expected", i32 1717986918, i32 429496731} -; CHECK: !1 = !{!"branch_weights", !"expected", i32 429496731, i32 1717986918} -; CHECK: !2 = !{!"branch_weights", !"expected", i32 214748366, i32 214748366, i32 1717986918} -; CHECK: !3 = !{!"branch_weights", !"expected", i32 1717986918, i32 214748366, i32 214748366} +; CHECK: !0 = !{!"branch_weights", i32 1717986918, i32 429496731} +; CHECK: !1 = !{!"branch_weights", i32 429496731, i32 1717986918} +; CHECK: !2 = !{!"branch_weights", i32 214748366, i32 214748366, i32 1717986918} +; CHECK: !3 = !{!"branch_weights", i32 1717986918, i32 214748366, i32 214748366} diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll b/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll index 458a7758fa9707..2bcfb1e064be96 100644 --- a/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll +++ b/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll @@ -99,5 +99,6 @@ attributes #1 = { nounwind readnone } !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{!"clang version 5.0.0 (trunk 304373)"} -; CHECK: [[LIKELY]] = !{!"branch_weights", !"expected", i32 2000, i32 1} -; CHECK: [[UNLIKELY]] = !{!"branch_weights", !"expected", i32 1, i32 2000} +; CHECK: [[LIKELY]] = !{!"branch_weights", i32 2000, i32 1} +; CHECK: [[UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000} + diff --git 
a/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll index 9b9d9a746dd324..32ae9b0b2f15cb 100644 --- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll +++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll @@ -352,5 +352,5 @@ declare i64 @llvm.expect.i64(i64, i64) !llvm.ident = !{!0} !0 = !{!"clang version 5.0.0 (trunk 302965)"} -; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 2000, i32 1} -; CHECK: [[WEIGHT2]] = !{!"branch_weights", !"expected", i32 1, i32 2000} +; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1} +; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000} diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll index e9a843225993ad..1efa63241c2c06 100644 --- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll +++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll @@ -99,5 +99,5 @@ declare i64 @llvm.expect.i64(i64, i64) !0 = !{!"clang version 5.0.0 (trunk 302965)"} -; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 2000, i32 1} -; CHECK: [[WEIGHT2]] = !{!"branch_weights", !"expected", i32 1, i32 2000} +; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1} +; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000} diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll index 13db2c394bab29..9cbaca8d13dc0a 100644 --- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll +++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll @@ -53,4 +53,4 @@ declare i64 @llvm.expect.i64(i64, i64) !0 = !{!"clang version 5.0.0 (trunk 302965)"} -; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 1, i32 2000} +; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 1, i32 2000} diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll index 
275731d6188955..2bad66343b761b 100644 --- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll +++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll @@ -235,5 +235,5 @@ block5: ret void } -; CHECK: !0 = !{!"branch_weights", !"expected", i32 2147483647, i32 1} -; CHECK: !1 = !{!"branch_weights", !"expected", i32 1, i32 2147483647} +; CHECK: !0 = !{!"branch_weights", i32 2147483647, i32 1} +; CHECK: !1 = !{!"branch_weights", i32 1, i32 2147483647} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll index a090fe20e1d937..55dd28b70170b8 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll @@ -293,7 +293,7 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} -; CHECK: [[PROF4]] = !{!"branch_weights", !"expected", i32 2000, i32 1} +; CHECK: [[PROF4]] = !{!"branch_weights", i32 2000, i32 1} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} ;. From 1bebb99324a1cd85e18e1907c7afdde5d2bc4593 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Tue, 11 Jun 2024 14:06:23 +0800 Subject: [PATCH 21/82] [RISCV] Add B extension (#76893) It seems that we have `B` extension again: https://github.com/riscv/riscv-b According to the spec, `B` extension represents the collection of the `Zba`, `Zbb`, `Zbs` extensions. Though it hasn't been ratified, I set its version to `1.0`. 
--- clang/test/Driver/riscv-arch.c | 5 ----- clang/test/Preprocessor/riscv-target-features.c | 12 ++++++++++++ llvm/docs/RISCVUsage.rst | 1 + llvm/docs/ReleaseNotes.rst | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 8 ++++++++ llvm/test/CodeGen/RISCV/attributes.ll | 4 ++++ llvm/unittests/TargetParser/RISCVISAInfoTest.cpp | 9 ++------- 7 files changed, 28 insertions(+), 12 deletions(-) diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c index ddf617bbb62372..ffd92e1f398c45 100644 --- a/clang/test/Driver/riscv-arch.c +++ b/clang/test/Driver/riscv-arch.c @@ -231,11 +231,6 @@ // RV32-STD: error: invalid arch name 'rv32imqc', // RV32-STD: unsupported standard user-level extension 'q' -// RUN: not %clang --target=riscv32-unknown-elf -march=rv32ib -### %s \ -// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-B %s -// RV32-B: error: invalid arch name 'rv32ib', -// RV32-B: unsupported standard user-level extension 'b' - // RUN: not %clang --target=riscv32-unknown-elf -march=rv32xabc -### %s \ // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X %s // RV32X: error: invalid arch name 'rv32xabc', diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 09b9ad0a160bb1..91307141e0406b 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -7,6 +7,7 @@ // CHECK-NOT: __riscv_64e {{.*$}} // CHECK-NOT: __riscv_a {{.*$}} // CHECK-NOT: __riscv_atomic +// CHECK-NOT: __riscv_b {{.*$}} // CHECK-NOT: __riscv_c {{.*$}} // CHECK-NOT: __riscv_compressed {{.*$}} // CHECK-NOT: __riscv_d {{.*$}} @@ -194,6 +195,17 @@ // CHECK-A-EXT: __riscv_a 2001000{{$}} // CHECK-A-EXT: __riscv_atomic 1 +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ib -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-B-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ib -x c -E -dM %s \ +// RUN: -o 
- | FileCheck --check-prefix=CHECK-B-EXT %s +// CHECK-B-EXT: __riscv_b 1000000{{$}} +// CHECK-B-EXT: __riscv_zba 1000000{{$}} +// CHECK-B-EXT: __riscv_zbb 1000000{{$}} +// CHECK-B-EXT: __riscv_zbs 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32ic -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-C-EXT %s diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 35115e67ecf924..ef06f80c747f94 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -85,6 +85,7 @@ on support follow. Extension Status ================ ================================================================= ``A`` Supported + ``B`` Supported ``C`` Supported ``D`` Supported ``F`` Supported diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 8cdb9db087c778..b46994bbcd66de 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -152,6 +152,7 @@ Changes to the RISC-V Backend * Zaamo and Zalrsc are no longer experimental. * Processors that enable post reg-alloc scheduling (PostMachineScheduler) by default should use the `UsePostRAScheduler` subtarget feature. Setting `PostRAScheduler = 1` in the scheduler model will have no effect on the enabling of the PostMachineScheduler. * Zabha is no longer experimental. +* B (the collection of the Zba, Zbb, Zbs extensions) is supported. 
Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 9bf06850483d84..011edca019fd60 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -477,6 +477,14 @@ def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">, // Bitmanip Extensions for Cryptography Extensions +def FeatureStdExtB + : RISCVExtension<"b", 1, 0, + "'B' (the collection of the Zba, Zbb, Zbs extensions)", + [FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs]>; +def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">, + AssemblerPredicate<(all_of FeatureStdExtB), + "'B' (the collection of the Zba, Zbb, Zbs extensions)">; + def FeatureStdExtZbkb : RISCVExtension<"zbkb", 1, 0, "'Zbkb' (Bitmanip instructions for Cryptography)">; diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 1c9356fb1a05a7..86b557700347e1 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV32ZMMUL %s ; RUN: llc -mtriple=riscv32 -mattr=+m,+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV32MZMMUL %s ; RUN: llc -mtriple=riscv32 -mattr=+a %s -o - | FileCheck --check-prefixes=CHECK,RV32A %s +; RUN: llc -mtriple=riscv32 -mattr=+b %s -o - | FileCheck --check-prefixes=CHECK,RV32B %s ; RUN: llc -mtriple=riscv32 -mattr=+f %s -o - | FileCheck --check-prefixes=CHECK,RV32F %s ; RUN: llc -mtriple=riscv32 -mattr=+d %s -o - | FileCheck --check-prefixes=CHECK,RV32D %s ; RUN: llc -mtriple=riscv32 -mattr=+c %s -o - | FileCheck --check-prefixes=CHECK,RV32C %s @@ -131,6 +132,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV64ZMMUL %s ; RUN: llc -mtriple=riscv64 -mattr=+m,+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV64MZMMUL %s ; RUN: llc 
-mtriple=riscv64 -mattr=+a %s -o - | FileCheck --check-prefixes=CHECK,RV64A %s +; RUN: llc -mtriple=riscv64 -mattr=+b %s -o - | FileCheck --check-prefixes=CHECK,RV64B %s ; RUN: llc -mtriple=riscv64 -mattr=+f %s -o - | FileCheck --check-prefixes=CHECK,RV64F %s ; RUN: llc -mtriple=riscv64 -mattr=+d %s -o - | FileCheck --check-prefixes=CHECK,RV64D %s ; RUN: llc -mtriple=riscv64 -mattr=+c %s -o - | FileCheck --check-prefixes=CHECK,RV64C %s @@ -277,6 +279,7 @@ ; RV32ZMMUL: .attribute 5, "rv32i2p1_zmmul1p0" ; RV32MZMMUL: .attribute 5, "rv32i2p1_m2p0_zmmul1p0" ; RV32A: .attribute 5, "rv32i2p1_a2p1" +; RV32B: .attribute 5, "rv32i2p1_b1p0_zba1p0_zbb1p0_zbs1p0" ; RV32F: .attribute 5, "rv32i2p1_f2p2_zicsr2p0" ; RV32D: .attribute 5, "rv32i2p1_f2p2_d2p2_zicsr2p0" ; RV32C: .attribute 5, "rv32i2p1_c2p0" @@ -402,6 +405,7 @@ ; RV64ZMMUL: .attribute 5, "rv64i2p1_zmmul1p0" ; RV64MZMMUL: .attribute 5, "rv64i2p1_m2p0_zmmul1p0" ; RV64A: .attribute 5, "rv64i2p1_a2p1" +; RV64B: .attribute 5, "rv64i2p1_b1p0_zba1p0_zbb1p0_zbs1p0" ; RV64F: .attribute 5, "rv64i2p1_f2p2_zicsr2p0" ; RV64D: .attribute 5, "rv64i2p1_f2p2_d2p2_zicsr2p0" ; RV64C: .attribute 5, "rv64i2p1_c2p0" diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 22a8a05ef3719a..128321fc3ae731 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -312,8 +312,6 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) { } TEST(ParseArchString, RejectsUnrecognizedExtensionNamesByDefault) { - EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64ib", true).takeError()), - "unsupported standard user-level extension 'b'"); EXPECT_EQ( toString( RISCVISAInfo::parseArchString("rv32i_zmadeup", true).takeError()), @@ -326,9 +324,6 @@ TEST(ParseArchString, RejectsUnrecognizedExtensionNamesByDefault) { toString( RISCVISAInfo::parseArchString("rv64g_xmadeup", true).takeError()), "unsupported non-standard 
user-level extension 'xmadeup'"); - EXPECT_EQ( - toString(RISCVISAInfo::parseArchString("rv64ib1p0", true).takeError()), - "unsupported standard user-level extension 'b'"); EXPECT_EQ( toString( RISCVISAInfo::parseArchString("rv32i_zmadeup1p0", true).takeError()), @@ -344,8 +339,7 @@ TEST(ParseArchString, RejectsUnrecognizedExtensionNamesByDefault) { } TEST(ParseArchString, IgnoresUnrecognizedExtensionNamesWithIgnoreUnknown) { - for (StringRef Input : {"rv32ib", "rv32i_zmadeup", - "rv64i_smadeup", "rv64i_xmadeup"}) { + for (StringRef Input : {"rv32i_zmadeup", "rv64i_smadeup", "rv64i_xmadeup"}) { auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true); ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded()); RISCVISAInfo &Info = **MaybeISAInfo; @@ -913,6 +907,7 @@ R"(All available -march extensions for RISC-V f 2.2 d 2.2 c 2.0 + b 1.0 v 1.0 h 1.0 zic64b 1.0 From 34033dc1610982570328b39a38596a9968e3c7b2 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 11 Jun 2024 08:08:03 +0200 Subject: [PATCH 22/82] [clangd] Use clang_target_link_libraries() for clang libs (#94937) Use clang_target_link_libraries() instead of LINK_LIBS when linking clang libraries. This ensures that in CLANG_LINK_CLANG_DYLIB mode we link against libclang-cpp.so (instead of linking against both it and the static libraries). Most places were already doing this correctly, there were just a handful of leftovers. 
--- clang-tools-extra/clangd/index/remote/CMakeLists.txt | 6 +++++- clang-tools-extra/pseudo/lib/CMakeLists.txt | 8 ++++++-- clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt | 6 +++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clangd/index/remote/CMakeLists.txt b/clang-tools-extra/clangd/index/remote/CMakeLists.txt index ed6269d2ccaa98..106bbeff84ccf3 100644 --- a/clang-tools-extra/clangd/index/remote/CMakeLists.txt +++ b/clang-tools-extra/clangd/index/remote/CMakeLists.txt @@ -26,7 +26,6 @@ if (CLANGD_ENABLE_REMOTE) clangdRemoteIndexProto clangdRemoteIndexServiceProto clangdRemoteMarshalling - clangBasic clangDaemon clangdSupport @@ -35,6 +34,11 @@ if (CLANGD_ENABLE_REMOTE) clangdRemoteIndexServiceProto ) + clang_target_link_libraries(clangdRemoteIndex + PRIVATE + clangBasic + ) + add_subdirectory(marshalling) add_subdirectory(server) add_subdirectory(monitor) diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt index f92f79be121508..a13b5d20cf7c3b 100644 --- a/clang-tools-extra/pseudo/lib/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt @@ -14,8 +14,6 @@ add_clang_library(clangPseudo Token.cpp LINK_LIBS - clangBasic - clangLex clangPseudoGrammar DEPENDS @@ -25,3 +23,9 @@ add_clang_library(clangPseudo target_include_directories(clangPseudo INTERFACE $ ) + +clang_target_link_libraries(clangPseudo + PRIVATE + clangBasic + clangLex + ) diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt index d56d16c893c3d4..2fecdce6a10f9c 100644 --- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt +++ b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt @@ -9,7 +9,11 @@ add_clang_library(clangPseudoCXX cxx_gen LINK_LIBS - clangBasic clangPseudo clangPseudoGrammar ) + +clang_target_link_libraries(clangPseudoCXX + PRIVATE + clangBasic + ) From bfa8150ef4748a364a496f8cdf285380ac3ec876 Mon Sep 17 00:00:00 2001 From: Johannes 
Reifferscheid Date: Tue, 11 Jun 2024 08:20:35 +0200 Subject: [PATCH 23/82] Explain partial byte extraction logic. (#92868) This is a follow-up to #92506. --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 82770f8660850c..ca077d41d36bac 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1845,6 +1845,10 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, auto AddIntToBuffer = [AggBuffer, Bytes](const APInt &Val) { size_t NumBytes = (Val.getBitWidth() + 7) / 8; SmallVector Buf(NumBytes); + // `extractBitsAsZExtValue` does not allow the extraction of bits beyond the + // input's bit width, and i1 arrays may not have a length that is a multuple + // of 8. We handle the last byte separately, so we never request out of + // bounds bits. for (unsigned I = 0; I < NumBytes - 1; ++I) { Buf[I] = Val.extractBitsAsZExtValue(8, I * 8); } From 876c6204f12fa2738ff8ca886e664b826847d6d4 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Tue, 11 Jun 2024 14:28:51 +0800 Subject: [PATCH 24/82] [RISCV][MC] Warn if SEW/LMUL may not be compatible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to RVV spec: > In general, the requirement is to support LMUL ≥ SEWMIN/ELEN, > where SEWMIN is the narrowest supported SEW value and ELEN is > the widest supported SEW value. > > For a given supported fractional LMUL setting, implementations > must support SEW settings between SEWMIN and LMUL * ELEN, inclusive. We print a warning if these requirements are not met. 
Reviewers: kito-cheng, asb, frasercrmck, jrtc27, michaelmaitland, lukel97 Reviewed By: lukel97 Pull Request: https://github.com/llvm/llvm-project/pull/94313 --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 21 +++++++++++++++++++ llvm/test/MC/RISCV/rvv/vsetvl.s | 5 +++++ 2 files changed, 26 insertions(+) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 5906a2cdb3bfa1..8ac1cdf0a7a9ce 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2155,6 +2155,16 @@ bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State, break; if (!RISCVVType::isValidLMUL(Lmul, Fractional)) break; + + if (Fractional) { + unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32; + unsigned MinLMUL = ELEN / 8; + if (Lmul > MinLMUL) + Warning(Tok.getLoc(), + "use of vtype encodings with LMUL < SEWMIN/ELEN == mf" + + Twine(MinLMUL) + " is reserved"); + } + State = VTypeState_TailPolicy; return false; } @@ -2194,6 +2204,7 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { bool MaskAgnostic = false; VTypeState State = VTypeState_SEW; + SMLoc SEWLoc = S; if (parseVTypeToken(getTok(), State, Sew, Lmul, Fractional, TailAgnostic, MaskAgnostic)) @@ -2211,6 +2222,16 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { if (getLexer().is(AsmToken::EndOfStatement) && State == VTypeState_Done) { RISCVII::VLMUL VLMUL = RISCVVType::encodeLMUL(Lmul, Fractional); + if (Fractional) { + unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32; + unsigned MaxSEW = ELEN / Lmul; + // If MaxSEW < 8, we should have printed warning about reserved LMUL. 
+ if (MaxSEW >= 8 && Sew > MaxSEW) + Warning(SEWLoc, + "use of vtype encodings with SEW > " + Twine(MaxSEW) + + " and LMUL == mf" + Twine(Lmul) + + " may not be compatible with all RVV implementations"); + } unsigned VTypeI = RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic); diff --git a/llvm/test/MC/RISCV/rvv/vsetvl.s b/llvm/test/MC/RISCV/rvv/vsetvl.s index c9197d8917a472..2741def0eeff21 100644 --- a/llvm/test/MC/RISCV/rvv/vsetvl.s +++ b/llvm/test/MC/RISCV/rvv/vsetvl.s @@ -1,5 +1,7 @@ # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \ # RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+zve32x %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ZVE32X # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ # RUN: | FileCheck %s --check-prefix=CHECK-ERROR # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \ @@ -71,18 +73,21 @@ vsetvli a2, a0, e32, m8, ta, ma vsetvli a2, a0, e32, mf2, ta, ma # CHECK-INST: vsetvli a2, a0, e32, mf2, ta, ma +# CHECK-ZVE32X: :[[#@LINE-2]]:17: warning: use of vtype encodings with SEW > 16 and LMUL == mf2 may not be compatible with all RVV implementations{{$}} # CHECK-ENCODING: [0x57,0x76,0x75,0x0d] # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}} # CHECK-UNKNOWN: 0d757657 vsetvli a2, a0, e32, mf4, ta, ma # CHECK-INST: vsetvli a2, a0, e32, mf4, ta, ma +# CHECK-ZVE32X: :[[#@LINE-2]]:17: warning: use of vtype encodings with SEW > 8 and LMUL == mf4 may not be compatible with all RVV implementations{{$}} # CHECK-ENCODING: [0x57,0x76,0x65,0x0d] # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}} # CHECK-UNKNOWN: 0d657657 vsetvli a2, a0, e32, mf8, ta, ma # CHECK-INST: vsetvli a2, a0, e32, mf8, ta, ma +# CHECK-ZVE32X: 
:[[#@LINE-2]]:22: warning: use of vtype encodings with LMUL < SEWMIN/ELEN == mf4 is reserved{{$}} # CHECK-ENCODING: [0x57,0x76,0x55,0x0d] # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}} # CHECK-UNKNOWN: 0d557657 From 282534268e3be0949fcac1589b1004bc5672b434 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Tue, 11 Jun 2024 08:38:03 +0200 Subject: [PATCH 25/82] [clang][dataflow] Handle `AtomicExpr` in `ResultObjectVisitor`. (#94963) This is one of the node kinds that should be considered an "original initializer". The patch adds a test that was causing an assertion failure in `assert(Children.size() == 1)` without the fix. --- .../FlowSensitive/DataflowEnvironment.cpp | 2 +- .../Analysis/FlowSensitive/TransferTest.cpp | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index 0d7967c8b93449..7c88917faf9c65 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -415,7 +415,7 @@ class ResultObjectVisitor : public AnalysisASTVisitor { // below them can initialize the same object (or part of it). if (isa(E) || isa(E) || isa(E) || isa(E) || isa(E) || - isa(E) || + isa(E) || isa(E) || // We treat `BuiltinBitCastExpr` as an "original initializer" too as // it may not even be casting from a record type -- and even if it is, // the two objects are in general of unrelated type. 
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index f7e6b0c22e8db2..2a74d7fa63fd74 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -3345,6 +3345,32 @@ TEST(TransferTest, ResultObjectLocationForBuiltinBitCastExpr) { }); } +TEST(TransferTest, ResultObjectLocationForAtomicExpr) { + std::string Code = R"( + struct S {}; + void target(_Atomic(S) *ptr) { + S s = __c11_atomic_load(ptr, __ATOMIC_SEQ_CST); + // [[p]] + } + )"; + using ast_matchers::atomicExpr; + using ast_matchers::match; + using ast_matchers::selectFirst; + using ast_matchers::traverse; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + auto *Atomic = selectFirst( + "atomic", match(atomicExpr().bind("atomic"), ASTCtx)); + + EXPECT_EQ(&Env.getResultObjectLocation(*Atomic), + &getLocForDecl(ASTCtx, Env, "s")); + }); +} + TEST(TransferTest, ResultObjectLocationPropagatesThroughConditionalOperator) { std::string Code = R"( struct A { From 05e87a583ce2680fc7309491c8bc23fd0e3bd049 Mon Sep 17 00:00:00 2001 From: aengelke Date: Tue, 11 Jun 2024 08:39:42 +0200 Subject: [PATCH 26/82] Reland "[MC][NFC] Make ELFUniquingMap a StringMap (#95006)" (#95030) This avoids std::map, which is slow, and uses a StringMap. Section name, group name, linked-to name and unique id are encoded into the key for fast lookup. This gives a measurable performance boost for applications that compile many small object files (e.g., functions in JIT compilers). --- Now also the second case works properly. That's what happens when you do that last refactoring without re-running all tests... sorry. 
--- llvm/include/llvm/MC/MCContext.h | 27 +----------------- llvm/lib/MC/MCContext.cpp | 47 +++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index c560b62f802e5d..72eae85467dc9d 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -252,31 +252,6 @@ class MCContext { /// A collection of MCPseudoProbe in the current module MCPseudoProbeTable PseudoProbeTable; - // Sections are differentiated by the quadruple (section_name, group_name, - // unique_id, link_to_symbol_name). Sections sharing the same quadruple are - // combined into one section. - struct ELFSectionKey { - std::string SectionName; - StringRef GroupName; - StringRef LinkedToName; - unsigned UniqueID; - - ELFSectionKey(StringRef SectionName, StringRef GroupName, - StringRef LinkedToName, unsigned UniqueID) - : SectionName(SectionName), GroupName(GroupName), - LinkedToName(LinkedToName), UniqueID(UniqueID) {} - - bool operator<(const ELFSectionKey &Other) const { - if (SectionName != Other.SectionName) - return SectionName < Other.SectionName; - if (GroupName != Other.GroupName) - return GroupName < Other.GroupName; - if (int O = LinkedToName.compare(Other.LinkedToName)) - return O < 0; - return UniqueID < Other.UniqueID; - } - }; - struct COFFSectionKey { std::string SectionName; StringRef GroupName; @@ -350,8 +325,8 @@ class MCContext { }; StringMap MachOUniquingMap; - std::map ELFUniquingMap; std::map COFFUniquingMap; + StringMap ELFUniquingMap; std::map GOFFUniquingMap; std::map WasmUniquingMap; std::map XCOFFUniquingMap; diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 74b9f8d2addbcd..15900547179602 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -44,6 +44,7 @@ #include "llvm/MC/SectionKind.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/EndianStream.h" 
#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -547,16 +548,42 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type, if (GroupSym) Group = GroupSym->getName(); assert(!(LinkedToSym && LinkedToSym->getName().empty())); - // Do the lookup, if we have a hit, return it. - auto IterBool = ELFUniquingMap.insert(std::make_pair( - ELFSectionKey{Section.str(), Group, - LinkedToSym ? LinkedToSym->getName() : "", UniqueID}, - nullptr)); - auto &Entry = *IterBool.first; - if (!IterBool.second) - return Entry.second; - StringRef CachedName = Entry.first.SectionName; + // Sections are differentiated by the quadruple (section_name, group_name, + // unique_id, link_to_symbol_name). Sections sharing the same quadruple are + // combined into one section. As an optimization, non-unique sections without + // group or linked-to symbol have a shorter unique-ing key. + std::pair::iterator, bool> EntryNewPair; + // Length of the section name, which are the first SectionLen bytes of the key + unsigned SectionLen; + if (GroupSym || LinkedToSym || UniqueID != MCSection::NonUniqueID) { + SmallString<128> Buffer; + Section.toVector(Buffer); + SectionLen = Buffer.size(); + Buffer.push_back(0); // separator which cannot occur in the name + if (GroupSym) + Buffer.append(GroupSym->getName()); + Buffer.push_back(0); // separator which cannot occur in the name + if (LinkedToSym) + Buffer.append(LinkedToSym->getName()); + support::endian::write(Buffer, UniqueID, endianness::native); + StringRef UniqueMapKey = StringRef(Buffer); + EntryNewPair = ELFUniquingMap.insert(std::make_pair(UniqueMapKey, nullptr)); + } else if (!Section.isSingleStringRef()) { + SmallString<128> Buffer; + StringRef UniqueMapKey = Section.toStringRef(Buffer); + SectionLen = UniqueMapKey.size(); + EntryNewPair = ELFUniquingMap.insert(std::make_pair(UniqueMapKey, nullptr)); + } else { + StringRef UniqueMapKey = Section.getSingleStringRef(); 
+ SectionLen = UniqueMapKey.size(); + EntryNewPair = ELFUniquingMap.insert(std::make_pair(UniqueMapKey, nullptr)); + } + + if (!EntryNewPair.second) + return EntryNewPair.first->second; + + StringRef CachedName = EntryNewPair.first->getKey().take_front(SectionLen); SectionKind Kind; if (Flags & ELF::SHF_ARM_PURECODE) @@ -600,7 +627,7 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type, MCSectionELF *Result = createELFSectionImpl(CachedName, Type, Flags, Kind, EntrySize, GroupSym, IsComdat, UniqueID, LinkedToSym); - Entry.second = Result; + EntryNewPair.first->second = Result; recordELFMergeableSectionInfo(Result->getName(), Result->getFlags(), Result->getUniqueID(), Result->getEntrySize()); From 275196d866c86d95fc46b3324876ccbea09da79b Mon Sep 17 00:00:00 2001 From: martinboehme Date: Tue, 11 Jun 2024 08:40:02 +0200 Subject: [PATCH 27/82] [clang][nullability] Don't return null fields from `getReferencedDecls()`. (#94983) The patch includes a repro for a case where we were returning a null `FieldDecl` when calling `getReferencedDecls()` on the `InitListExpr` for a union. Also, I noticed while working on this that `RecordInitListHelper` has a bug where it doesn't work correctly for empty unions. This patch also includes a repro and fix for this bug. 
--- clang/docs/tools/clang-formatted-files.txt | 1 + clang/lib/Analysis/FlowSensitive/ASTOps.cpp | 11 ++- .../Analysis/FlowSensitive/ASTOpsTest.cpp | 88 +++++++++++++++++++ .../Analysis/FlowSensitive/CMakeLists.txt | 1 + .../unittests/Analysis/FlowSensitive/BUILD.gn | 1 + 5 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt index dee51e402b687f..4866bd4aee634f 100644 --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -622,6 +622,7 @@ clang/tools/libclang/CXCursor.h clang/tools/scan-build-py/tests/functional/src/include/clean-one.h clang/unittests/Analysis/CFGBuildResult.h clang/unittests/Analysis/MacroExpansionContextTest.cpp +clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp clang/unittests/Analysis/FlowSensitive/CNFFormula.cpp clang/unittests/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp diff --git a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp index 38b5f51b7b2f02..27d42a7b508562 100644 --- a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp +++ b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp @@ -100,7 +100,8 @@ getFieldsForInitListExpr(const InitListT *InitList) { std::vector Fields; if (InitList->getType()->isUnionType()) { - Fields.push_back(InitList->getInitializedFieldInUnion()); + if (const FieldDecl *Field = InitList->getInitializedFieldInUnion()) + Fields.push_back(Field); return Fields; } @@ -137,9 +138,11 @@ RecordInitListHelper::RecordInitListHelper( // it doesn't do this -- so we create an `ImplicitValueInitExpr` ourselves. 
SmallVector InitsForUnion; if (Ty->isUnionType() && Inits.empty()) { - assert(Fields.size() == 1); - ImplicitValueInitForUnion.emplace(Fields.front()->getType()); - InitsForUnion.push_back(&*ImplicitValueInitForUnion); + assert(Fields.size() <= 1); + if (!Fields.empty()) { + ImplicitValueInitForUnion.emplace(Fields.front()->getType()); + InitsForUnion.push_back(&*ImplicitValueInitForUnion); + } Inits = InitsForUnion; } diff --git a/clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp b/clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp new file mode 100644 index 00000000000000..cd1c076ab09e6b --- /dev/null +++ b/clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp @@ -0,0 +1,88 @@ +//===- unittests/Analysis/FlowSensitive/ASTOpsTest.cpp --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/FlowSensitive/ASTOps.h" +#include "TestingSupport.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include + +namespace { + +using namespace clang; +using namespace dataflow; + +using ast_matchers::cxxRecordDecl; +using ast_matchers::hasName; +using ast_matchers::hasType; +using ast_matchers::initListExpr; +using ast_matchers::match; +using ast_matchers::selectFirst; +using test::findValueDecl; +using testing::IsEmpty; +using testing::UnorderedElementsAre; + +TEST(ASTOpsTest, RecordInitListHelperOnEmptyUnionInitList) { + // This is a regression test: The `RecordInitListHelper` used to assert-fail + // when called for the `InitListExpr` of an empty union. 
+ std::string Code = R"cc( + struct S { + S() : UField{} {}; + + union U {} UField; + }; + )cc"; + std::unique_ptr Unit = + tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++17"}); + auto &ASTCtx = Unit->getASTContext(); + + ASSERT_EQ(ASTCtx.getDiagnostics().getClient()->getNumErrors(), 0U); + + auto *InitList = selectFirst( + "init", + match(initListExpr(hasType(cxxRecordDecl(hasName("U")))).bind("init"), + ASTCtx)); + ASSERT_NE(InitList, nullptr); + + RecordInitListHelper Helper(InitList); + EXPECT_THAT(Helper.base_inits(), IsEmpty()); + EXPECT_THAT(Helper.field_inits(), IsEmpty()); +} + +TEST(ASTOpsTest, ReferencedDeclsOnUnionInitList) { + // This is a regression test: `getReferencedDecls()` used to return a null + // `FieldDecl` in this case (in addition to the correct non-null `FieldDecl`) + // because `getInitializedFieldInUnion()` returns null for the syntactic form + // of the `InitListExpr`. + std::string Code = R"cc( + struct S { + S() : UField{0} {}; + + union U { + int I; + } UField; + }; + )cc"; + std::unique_ptr Unit = + tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++17"}); + auto &ASTCtx = Unit->getASTContext(); + + ASSERT_EQ(ASTCtx.getDiagnostics().getClient()->getNumErrors(), 0U); + + auto *InitList = selectFirst( + "init", + match(initListExpr(hasType(cxxRecordDecl(hasName("U")))).bind("init"), + ASTCtx)); + ASSERT_NE(InitList, nullptr); + auto *IDecl = cast(findValueDecl(ASTCtx, "I")); + + EXPECT_THAT(getReferencedDecls(*InitList).Fields, + UnorderedElementsAre(IDecl)); +} + +} // namespace diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt index cfabb80576bc12..12fee5dc2789ce 100644 --- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt @@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS add_clang_unittest(ClangAnalysisFlowSensitiveTests ArenaTest.cpp + ASTOpsTest.cpp 
CFGMatchSwitchTest.cpp ChromiumCheckModelTest.cpp DataflowAnalysisContextTest.cpp diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn index e16ca31b81a8d3..780a69f1f3299b 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn @@ -18,6 +18,7 @@ unittest("ClangAnalysisFlowSensitiveTests") { "//llvm/lib/Testing/Support", ] sources = [ + "ASTOpsTest.cpp", "ArenaTest.cpp", "CFGMatchSwitchTest.cpp", "ChromiumCheckModelTest.cpp", From e9a362362e03c6aea7a64473e8b8bb9140fd3855 Mon Sep 17 00:00:00 2001 From: Vikash Gupta <35700483+vg0204@users.noreply.github.com> Date: Tue, 11 Jun 2024 12:10:40 +0530 Subject: [PATCH 28/82] [CodeGen] Preserved additional analyses in StackSlotColoring pass. (#93779) The pass pipeline of some architectures splits the register allocation phase based on different register classes. Some analyses need to be computed at the beginning of register allocation and kept alive until all values are assigned to physical registers. This poses a challenge for the objective of introducing StackSlotColoring after only part of the virtual registers have been assigned to physical registers, in order to optimize stack slot usage. As this pass doesn't preserve a few analyses that are still needed by the register allocation of the remaining virtual registers, they must now be marked as preserved.
--- llvm/lib/CodeGen/StackSlotColoring.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index 9fdc8a338b52a5..eb7a113b575f75 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -64,6 +65,7 @@ namespace { MachineFrameInfo *MFI = nullptr; const TargetInstrInfo *TII = nullptr; const MachineBlockFrequencyInfo *MBFI = nullptr; + SlotIndexes *Indexes = nullptr; // SSIntervals - Spill slot intervals. std::vector SSIntervals; @@ -152,6 +154,14 @@ namespace { AU.addRequired(); AU.addPreserved(); AU.addPreservedID(MachineDominatorsID); + + // In some Target's pipeline, register allocation (RA) might be + // split into multiple phases based on register class. So, this pass + // may be invoked multiple times requiring it to save these analyses to be + // used by RA later. 
+ AU.addPreserved(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); } @@ -496,8 +506,11 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) { ++I; } - for (MachineInstr *MI : toErase) + for (MachineInstr *MI : toErase) { + if (Indexes) + Indexes->removeMachineInstrFromMaps(*MI); MI->eraseFromParent(); + } return changed; } @@ -515,6 +528,7 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); LS = &getAnalysis(); MBFI = &getAnalysis(); + Indexes = &getAnalysis(); bool Changed = false; From 4cff320e0d4574d5084ad751f9ee3ca22c08afa1 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Tue, 11 Jun 2024 09:08:29 +0200 Subject: [PATCH 29/82] [flang] lower LBOUND for assumed-rank arrays (#94995) --- .../flang/Optimizer/Builder/Runtime/Inquiry.h | 8 ++- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 32 ++++++----- .../lib/Optimizer/Builder/Runtime/Inquiry.cpp | 14 +++++ .../Lower/HLFIR/assumed-rank-inquiries-3.f90 | 55 +++++++++++++++++++ 4 files changed, 92 insertions(+), 17 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h b/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h index 5f14d7781004b3..3707273e0cbd48 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h @@ -20,12 +20,14 @@ class FirOpBuilder; namespace fir::runtime { -/// Generate call to general `LboundDim` runtime routine. Calls to LBOUND -/// without a DIM argument get transformed into descriptor inquiries so they're -/// not handled in the runtime. +/// Generate call to `LboundDim` runtime routine. mlir::Value genLboundDim(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value array, mlir::Value dim); +/// Generate call to Lbound` runtime routine. 
+void genLbound(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value resultAddr, mlir::Value arrayt, mlir::Value kind); + /// Generate call to general `Ubound` runtime routine. Calls to UBOUND /// with a DIM argument get transformed into an expression equivalent to /// SIZE() + LBOUND() - 1, so they don't have an intrinsic in the runtime. diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index c3ef96956be1c5..4cdf1f2d98caa4 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6362,16 +6362,17 @@ IntrinsicLibrary::genLbound(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 2 || args.size() == 3); const fir::ExtendedValue &array = args[0]; - if (const auto *boxValue = array.getBoxOf()) - if (boxValue->hasAssumedRank()) - TODO(loc, "intrinsic: lbound with assumed rank argument"); + // Semantics builds signatures for LBOUND calls as either + // LBOUND(array, dim, [kind]) or LBOUND(array, [kind]). + const bool dimIsAbsent = args.size() == 2 || isStaticallyAbsent(args, 1); + if (array.hasAssumedRank() && dimIsAbsent) + return genAssumedRankBoundInquiry(builder, loc, resultType, args, + /*kindPos=*/1, fir::runtime::genLbound); mlir::Type indexType = builder.getIndexType(); - // Semantics builds signatures for LBOUND calls as either - // LBOUND(array, dim, [kind]) or LBOUND(array, [kind]). - if (args.size() == 2 || isStaticallyAbsent(args, 1)) { - // DIM is absent. + if (dimIsAbsent) { + // DIM is absent and the rank of array is a compile time constant. mlir::Type lbType = fir::unwrapSequenceType(resultType); unsigned rank = array.rank(); mlir::Type lbArrayType = fir::SequenceType::get( @@ -6396,13 +6397,16 @@ IntrinsicLibrary::genLbound(mlir::Type resultType, // DIM is present. mlir::Value dim = fir::getBase(args[1]); - // If it is a compile time constant, skip the runtime call. 
- if (std::optional cstDim = fir::getIntIfConstant(dim)) { - mlir::Value one = builder.createIntegerConstant(loc, resultType, 1); - mlir::Value zero = builder.createIntegerConstant(loc, indexType, 0); - mlir::Value lb = computeLBOUND(builder, loc, array, *cstDim - 1, zero, one); - return builder.createConvert(loc, resultType, lb); - } + // If it is a compile time constant and the rank is known, skip the runtime + // call. + if (!array.hasAssumedRank()) + if (std::optional cstDim = fir::getIntIfConstant(dim)) { + mlir::Value one = builder.createIntegerConstant(loc, resultType, 1); + mlir::Value zero = builder.createIntegerConstant(loc, indexType, 0); + mlir::Value lb = + computeLBOUND(builder, loc, array, *cstDim - 1, zero, one); + return builder.createConvert(loc, resultType, lb); + } fir::ExtendedValue box = createBoxForRuntimeBoundInquiry(loc, builder, array); return builder.createConvert( diff --git a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp index 34c4020b5907c9..e01a6f05b5fdd8 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp @@ -29,6 +29,20 @@ mlir::Value fir::runtime::genLboundDim(fir::FirOpBuilder &builder, return builder.create(loc, lboundFunc, args).getResult(0); } +void fir::runtime::genLbound(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value resultAddr, mlir::Value array, + mlir::Value kind) { + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = func.getFunctionType(); + auto sourceFile = fir::factory::locationToFilename(builder, loc); + auto sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(4)); + auto args = fir::runtime::createArguments( + builder, loc, fTy, resultAddr, array, kind, sourceFile, sourceLine); + builder.create(loc, func, args).getResult(0); +} + /// Generate call to `Ubound` runtime routine. 
Calls to UBOUND with a DIM /// argument get transformed into an expression equivalent to /// SIZE() + LBOUND() - 1, so they don't have an intrinsic in the runtime. diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 index bbeff5ff051915..e568b94f4f8843 100644 --- a/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 +++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 @@ -54,3 +54,58 @@ subroutine test_shape_2(x) ! CHECK: %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box>>) -> index ! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_14]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) + + +subroutine test_lbound(x) + real :: x(..) + call takes_integer_array(lbound(x)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_lbound( +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<15xi32> +! CHECK: %[[VAL_4:.*]] = arith.constant 4 : i32 +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box +! CHECK: %[[VAL_10:.*]] = fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> +! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index +! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_13]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) +! CHECK: %[[VAL_15:.*]] = arith.constant false +! CHECK: %[[VAL_16:.*]] = hlfir.as_expr %[[VAL_14]]#0 move %[[VAL_15]] : (!fir.box>, i1) -> !hlfir.expr +! 
CHECK: %[[VAL_17:.*]]:3 = hlfir.associate %[[VAL_16]](%[[VAL_13]]) {adapt.valuebyref} : (!hlfir.expr, !fir.shape<1>) -> (!fir.box>, !fir.ref>, i1) +! CHECK: fir.call @_QPtakes_integer_array(%[[VAL_17]]#1) fastmath : (!fir.ref>) -> () +! CHECK: hlfir.end_associate %[[VAL_17]]#1, %[[VAL_17]]#2 : !fir.ref>, i1 +! CHECK: hlfir.destroy %[[VAL_16]] : !hlfir.expr +! CHECK: return +! CHECK: } + +subroutine test_lbound_kind(x) + real :: x(..) + call takes_integer8_array(lbound(x, kind=8)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_lbound_kind( +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<15xi64> +! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32 +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box>) -> !fir.box +! CHECK: %[[VAL_10:.*]] = fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}}) +! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> +! CHECK: %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box>) -> index +! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_13]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) + +subroutine test_lbound_2(x) + real, pointer :: x(..) + call takes_integer_array(lbound(x)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_lbound_2( +! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<15xi32> +! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3:.*]] : !fir.ref>>> +! CHECK: %[[VAL_5:.*]] = arith.constant 4 : i32 +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.llvm_ptr +! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (!fir.box>>) -> !fir.box +! CHECK: %[[VAL_11:.*]] = fir.call @_FortranALbound(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}}) +! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref> +! 
CHECK: %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box>>) -> index +! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_14]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref>, !fir.shape<1>) -> (!fir.box>, !fir.ref>) From b64cf381a7cf3d04c1f9297862bc6b1f3b6da4cf Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Tue, 11 Jun 2024 08:11:46 +0100 Subject: [PATCH 30/82] [flang][debug] Support assumed shape arrays. (#94644) This PR generates dwarf to extract the information about the arrays from descriptor. The DWARF needs the offset of the fields like `lower_bound` and `extent`. The getComponentOffset has been added to calculate them which pushes the issue of host and target data size into getDescFieldTypeModel. As we use data layout now, some tests needed to be adjusted to have a dummy data layout to avoid failure. With this change in place, GDB is able show the assumed shape arrays correctly. subroutine ff(n, m, arr) integer n, m integer :: arr(:, :) print *, arr do i = 1, n do j = 1, m arr(j, i) = (i * 5) + j + 10 end do end do print *, arr end subroutine ff Breakpoint 1, ff (n=4, m=3, arr=...) at test1.f90:13 13 print *, arr (gdb) p arr $1 = ((6, 7, 8, 9) (11, 12, 13, 14) (16, 17, 18, 19)) (gdb) ptype arr type = integer (4,3) (gdb) c Continuing. 
6 7 8 9 11 12 13 14 16 17 18 19 --- .../Optimizer/CodeGen/DescriptorModel.h | 24 ++-- flang/lib/Optimizer/CodeGen/TypeConverter.cpp | 2 +- .../Transforms/DebugTypeGenerator.cpp | 120 +++++++++++++++++- .../Optimizer/Transforms/DebugTypeGenerator.h | 12 ++ .../Integration/debug-assumed-shape-array.f90 | 13 ++ flang/test/Transforms/debug-90683.fir | 2 +- .../Transforms/debug-assumed-shape-array.fir | 16 +++ flang/test/Transforms/debug-complex-1.fir | 2 +- .../Transforms/debug-fixed-array-type.fir | 2 +- .../Transforms/debug-line-table-existing.fir | 2 +- .../Transforms/debug-line-table-inc-file.fir | 4 +- .../debug-line-table-inc-same-file.fir | 2 +- flang/test/Transforms/debug-line-table.fir | 2 +- flang/test/Transforms/debug-module-1.fir | 2 +- 14 files changed, 182 insertions(+), 23 deletions(-) rename flang/{lib => include/flang}/Optimizer/CodeGen/DescriptorModel.h (88%) create mode 100644 flang/test/Integration/debug-assumed-shape-array.f90 create mode 100644 flang/test/Transforms/debug-assumed-shape-array.fir diff --git a/flang/lib/Optimizer/CodeGen/DescriptorModel.h b/flang/include/flang/Optimizer/CodeGen/DescriptorModel.h similarity index 88% rename from flang/lib/Optimizer/CodeGen/DescriptorModel.h rename to flang/include/flang/Optimizer/CodeGen/DescriptorModel.h index ed35caef930149..ff0cf29e8073e6 100644 --- a/flang/lib/Optimizer/CodeGen/DescriptorModel.h +++ b/flang/include/flang/Optimizer/CodeGen/DescriptorModel.h @@ -35,73 +35,73 @@ using TypeBuilderFunc = mlir::Type (*)(mlir::MLIRContext *); /// Get the LLVM IR dialect model for building a particular C++ type, `T`. 
template -TypeBuilderFunc getModel(); +static TypeBuilderFunc getModel(); template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::LLVM::LLVMPointerType::get(context); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(unsigned) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(int) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(unsigned long) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(unsigned long long) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(long long) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(Fortran::ISO::CFI_rank_t) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(Fortran::ISO::CFI_type_t) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { return mlir::IntegerType::get(context, sizeof(long) * 8); }; } template <> -TypeBuilderFunc getModel() { +constexpr TypeBuilderFunc 
getModel() { return [](mlir::MLIRContext *context) -> mlir::Type { auto indexTy = getModel()(context); return mlir::LLVM::LLVMArrayType::get(indexTy, 3); }; } template <> -TypeBuilderFunc +constexpr TypeBuilderFunc getModel>() { return getModel(); } diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp index 07d3bd713ce45d..501a36f5b68ba6 100644 --- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp +++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp @@ -13,9 +13,9 @@ #define DEBUG_TYPE "flang-type-conversion" #include "flang/Optimizer/CodeGen/TypeConverter.h" -#include "DescriptorModel.h" #include "flang/Common/Fortran.h" #include "flang/Optimizer/Builder/Todo.h" // remove when TODO's are done +#include "flang/Optimizer/CodeGen/DescriptorModel.h" #include "flang/Optimizer/CodeGen/TBAABuilder.h" #include "flang/Optimizer/CodeGen/Target.h" #include "flang/Optimizer/Dialect/FIRType.h" diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index a174f2c2bc4bfd..53745d10fe9e4d 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -13,15 +13,55 @@ #define DEBUG_TYPE "flang-debug-type-generator" #include "DebugTypeGenerator.h" +#include "flang/Optimizer/CodeGen/DescriptorModel.h" +#include "flang/Optimizer/CodeGen/TypeConverter.h" +#include "flang/Optimizer/Support/DataLayout.h" +#include "mlir/Pass/Pass.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/Support/Debug.h" namespace fir { +/// Calculate offset of any field in the descriptor. 
+template +std::uint64_t getComponentOffset(const mlir::DataLayout &dl, + mlir::MLIRContext *context, + mlir::Type llvmFieldType) { + static_assert(DescriptorField > 0 && DescriptorField < 10); + mlir::Type previousFieldType = + getDescFieldTypeModel()(context); + std::uint64_t previousOffset = + getComponentOffset(dl, context, previousFieldType); + std::uint64_t offset = previousOffset + dl.getTypeSize(previousFieldType); + std::uint64_t fieldAlignment = dl.getTypeABIAlignment(llvmFieldType); + return llvm::alignTo(offset, fieldAlignment); +} +template <> +std::uint64_t getComponentOffset<0>(const mlir::DataLayout &dl, + mlir::MLIRContext *context, + mlir::Type llvmFieldType) { + return 0; +} + DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m) : module(m), kindMapping(getKindMapping(m)) { LLVM_DEBUG(llvm::dbgs() << "DITypeAttr generator\n"); + + std::optional dl = + fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/true); + if (!dl) { + mlir::emitError(module.getLoc(), "Missing data layout attribute in module"); + return; + } + + mlir::MLIRContext *context = module.getContext(); + + // The debug information requires the offset of certain fields in the + // descriptors like lower_bound and extent for each dimension. 
+ mlir::Type llvmDimsType = getDescFieldTypeModel()(context); + dimsOffset = getComponentOffset(*dl, context, llvmDimsType); + dimsSize = dl->getTypeSize(llvmDimsType); } static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context, @@ -37,10 +77,82 @@ static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) { llvm::dwarf::DW_ATE_signed); } +mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType( + fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, mlir::Location loc, bool genAllocated, + bool genAssociated) { + + mlir::MLIRContext *context = module.getContext(); + // FIXME: Assumed rank arrays not supported yet + if (seqTy.hasUnknownShape()) + return genPlaceholderType(context); + + llvm::SmallVector ops; + auto addOp = [&](unsigned opc, llvm::ArrayRef vals) { + ops.push_back(mlir::LLVM::DIExpressionElemAttr::get(context, opc, vals)); + }; + + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_deref, {}); + + // dataLocation = *base_addr + mlir::LLVM::DIExpressionAttr dataLocation = + mlir::LLVM::DIExpressionAttr::get(context, ops); + addOp(llvm::dwarf::DW_OP_lit0, {}); + addOp(llvm::dwarf::DW_OP_ne, {}); + + // allocated = associated = (*base_addr != 0) + mlir::LLVM::DIExpressionAttr valid = + mlir::LLVM::DIExpressionAttr::get(context, ops); + mlir::LLVM::DIExpressionAttr associated = genAllocated ? valid : nullptr; + mlir::LLVM::DIExpressionAttr allocated = genAssociated ? valid : nullptr; + ops.clear(); + + llvm::SmallVector elements; + mlir::LLVM::DITypeAttr elemTy = + convertType(seqTy.getEleTy(), fileAttr, scope, loc); + unsigned offset = dimsOffset; + const unsigned indexSize = dimsSize / 3; + for ([[maybe_unused]] auto _ : seqTy.getShape()) { + // For each dimension, find the offset of count and lower bound in the + // descriptor and generate the dwarf expression to extract it. 
+ // FIXME: If `indexSize` happens to be bigger than address size on the + // system then we may have to change 'DW_OP_deref' here. + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_plus_uconst, + {offset + (indexSize * kDimExtentPos)}); + addOp(llvm::dwarf::DW_OP_deref, {}); + // count[i] = *(base_addr + offset + (indexSize * kDimExtentPos)) + // where 'offset' is dimsOffset + (i * dimsSize) + mlir::LLVM::DIExpressionAttr countAttr = + mlir::LLVM::DIExpressionAttr::get(context, ops); + ops.clear(); + + addOp(llvm::dwarf::DW_OP_push_object_address, {}); + addOp(llvm::dwarf::DW_OP_plus_uconst, + {offset + (indexSize * kDimLowerBoundPos)}); + addOp(llvm::dwarf::DW_OP_deref, {}); + // lower_bound[i] = *(base_addr + offset + (indexSize * kDimLowerBoundPos)) + mlir::LLVM::DIExpressionAttr lowerAttr = + mlir::LLVM::DIExpressionAttr::get(context, ops); + ops.clear(); + + offset += dimsSize; + mlir::LLVM::DISubrangeAttr subrangeTy = mlir::LLVM::DISubrangeAttr::get( + context, nullptr, lowerAttr, countAttr, nullptr); + elements.push_back(subrangeTy); + } + return mlir::LLVM::DICompositeTypeAttr::get( + context, llvm::dwarf::DW_TAG_array_type, /*recursive id*/ {}, + /* name */ nullptr, /* file */ nullptr, /* line */ 0, + /* scope */ nullptr, elemTy, mlir::LLVM::DIFlags::Zero, + /* sizeInBits */ 0, /*alignInBits*/ 0, elements, dataLocation, + /* rank */ nullptr, allocated, associated); +} + mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType( fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, mlir::Location loc) { - mlir::MLIRContext *context = module.getContext(); // FIXME: Only fixed sizes arrays handled at the moment. 
if (seqTy.hasDynamicExtents()) @@ -112,6 +224,12 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, bitWidth * 2, llvm::dwarf::DW_ATE_complex_float); } else if (auto seqTy = mlir::dyn_cast_or_null(Ty)) { return convertSequenceType(seqTy, fileAttr, scope, loc); + } else if (auto boxTy = mlir::dyn_cast_or_null(Ty)) { + auto elTy = boxTy.getElementType(); + if (auto seqTy = mlir::dyn_cast_or_null(elTy)) + return convertBoxedSequenceType(seqTy, fileAttr, scope, loc, false, + false); + return genPlaceholderType(context); } else { // FIXME: These types are currently unhandled. We are generating a // placeholder type to allow us to test supported bits. diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h index 963c919d66825c..11515d11dfed63 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h @@ -35,8 +35,20 @@ class DebugTypeGenerator { mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, mlir::Location loc); + + /// The 'genAllocated' is true when we want to generate 'allocated' field + /// in the DICompositeType. It is needed for the allocatable arrays. + /// Similarly, 'genAssociated' is used with 'pointer' type to generate + /// 'associated' field. + mlir::LLVM::DITypeAttr + convertBoxedSequenceType(fir::SequenceType seqTy, + mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scope, mlir::Location loc, + bool genAllocated, bool genAssociated); mlir::ModuleOp module; KindMapping kindMapping; + std::uint64_t dimsSize; + std::uint64_t dimsOffset; }; } // namespace fir diff --git a/flang/test/Integration/debug-assumed-shape-array.f90 b/flang/test/Integration/debug-assumed-shape-array.f90 new file mode 100644 index 00000000000000..7b0801c12dba11 --- /dev/null +++ b/flang/test/Integration/debug-assumed-shape-array.f90 @@ -0,0 +1,13 @@ +! 
RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +subroutine ff(arr) + implicit none + integer :: arr(:, :) + return arr(1,1) +end subroutine ff + +! CHECK-DAG: !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref)) +! CHECK-DAG: ![[ELEMS]] = !{![[ELEM1:[0-9]+]], ![[ELEM2:[0-9]+]]} +! CHECK-DAG: ![[ELEM1]] = !DISubrange(lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref), upperBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref)) +! CHECK-DAG: ![[ELEM2]] = !DISubrange(lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 48, DW_OP_deref), upperBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref)) + diff --git a/flang/test/Transforms/debug-90683.fir b/flang/test/Transforms/debug-90683.fir index 9da0e5347d3f8f..cc6929c10411f8 100644 --- a/flang/test/Transforms/debug-90683.fir +++ b/flang/test/Transforms/debug-90683.fir @@ -2,7 +2,7 @@ // This test checks that debug information for fir.real type works ok. 
-module attributes {} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { func.func @_QPfn1(%arg0: !fir.ref> {fir.bindc_name = "a"} ) { %0 = fir.declare %arg0 {uniq_name = "_QFfn1Ea"} : (!fir.ref>) -> !fir.ref> %1 = fir.alloca f32 {bindc_name = "abserror", uniq_name = "_QFfn1Eabserror"} diff --git a/flang/test/Transforms/debug-assumed-shape-array.fir b/flang/test/Transforms/debug-assumed-shape-array.fir new file mode 100644 index 00000000000000..00dec9b318c811 --- /dev/null +++ b/flang/test/Transforms/debug-assumed-shape-array.fir @@ -0,0 +1,16 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"} { + func.func @ff_(%arg0: !fir.box> {fir.bindc_name = "arr"} ) { + %0 = fir.undefined !fir.dscope + %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFffEarr"} : (!fir.box>, !fir.dscope) -> !fir.box> loc(#loc1) + return + } loc(#loc2) +} +#loc1 = loc("test1.f90":1:1) +#loc2 = loc("test1.f90":3:16) + +// CHECK: #llvm.di_composite_type, upperBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(32), DW_OP_deref]>> +// CHECK-SAME: #llvm.di_subrange, upperBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(56), 
DW_OP_deref]>> +// CHECK-SAME: dataLocation = <[DW_OP_push_object_address, DW_OP_deref]>> diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir index a3cbd767d8a58e..cc742d3b183bbf 100644 --- a/flang/test/Transforms/debug-complex-1.fir +++ b/flang/test/Transforms/debug-complex-1.fir @@ -3,7 +3,7 @@ // check conversion of complex type of different size. Both fir and mlir // variants are checked. -module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { func.func @test1(%x : !fir.complex<4>) -> !fir.complex<8> { %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<8> return %1 : !fir.complex<8> diff --git a/flang/test/Transforms/debug-fixed-array-type.fir b/flang/test/Transforms/debug-fixed-array-type.fir index 401c725411831e..d4ed0b97020898 100644 --- a/flang/test/Transforms/debug-fixed-array-type.fir +++ b/flang/test/Transforms/debug-fixed-array-type.fir @@ -1,6 +1,6 @@ // RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s -module attributes {} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { func.func @_QQmain() attributes {fir.bindc_name = "mn"} { %c7 = arith.constant 7 : index %c8 = arith.constant 8 : index diff --git a/flang/test/Transforms/debug-line-table-existing.fir b/flang/test/Transforms/debug-line-table-existing.fir index 534278ebc972d3..0e006303c8a81d 100644 --- a/flang/test/Transforms/debug-line-table-existing.fir +++ b/flang/test/Transforms/debug-line-table-existing.fir @@ -3,7 +3,7 @@ // REQUIRES: system-linux // Test that there are no changes to a function with existed fused loc debug -module attributes {} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { func.func @_QPs1() { return loc(#loc1) } loc(#loc2) diff --git a/flang/test/Transforms/debug-line-table-inc-file.fir b/flang/test/Transforms/debug-line-table-inc-file.fir index 9370c138fd42ff..065039b59c5ae8 100644 --- 
a/flang/test/Transforms/debug-line-table-inc-file.fir +++ b/flang/test/Transforms/debug-line-table-inc-file.fir @@ -3,7 +3,7 @@ // REQUIRES: system-linux // Test for included functions that have a different debug location than the current file -module attributes {} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { func.func @_QPsinc() { return loc(#loc2) } loc(#loc1) @@ -19,7 +19,7 @@ module attributes {} { #loc4 = loc("/home/user01/llvm-project/build_release/simple.f90":4:3) #loc5 = loc("/home/user01/llvm-project/build_release/simple.f90":5:1) -// CHECK: module { +// CHECK: module // CHECK: func.func @_QPsinc() { // CHECK: } loc(#[[FUSED_LOC_INC_FILE:.*]]) // CHECK: func.func @_QQmain() { diff --git a/flang/test/Transforms/debug-line-table-inc-same-file.fir b/flang/test/Transforms/debug-line-table-inc-same-file.fir index 4836f2e21dd9db..bcaf4497982310 100644 --- a/flang/test/Transforms/debug-line-table-inc-same-file.fir +++ b/flang/test/Transforms/debug-line-table-inc-same-file.fir @@ -4,7 +4,7 @@ // Test that there is only one FileAttribute generated for multiple functions // in the same file. 
-module attributes {} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { func.func @_QPs1() { return loc(#loc2) } loc(#loc1) diff --git a/flang/test/Transforms/debug-line-table.fir b/flang/test/Transforms/debug-line-table.fir index 8a72ca2a856a70..d6e54fd1ac467e 100644 --- a/flang/test/Transforms/debug-line-table.fir +++ b/flang/test/Transforms/debug-line-table.fir @@ -3,7 +3,7 @@ // RUN: fir-opt --add-debug-info="debug-level=LineTablesOnly" --mlir-print-debuginfo %s | FileCheck %s --check-prefix=LINETABLE // RUN: fir-opt --add-debug-info="is-optimized=true" --mlir-print-debuginfo %s | FileCheck %s --check-prefix=OPT -module attributes { fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", llvm.target_triple = "aarch64-unknown-linux-gnu"} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { func.func @_QPsb() { return loc(#loc_sb) } loc(#loc_sb) diff --git a/flang/test/Transforms/debug-module-1.fir b/flang/test/Transforms/debug-module-1.fir index 822ae01b99aa78..71457d32b15960 100644 --- a/flang/test/Transforms/debug-module-1.fir +++ b/flang/test/Transforms/debug-module-1.fir @@ -1,7 +1,7 @@ // RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s -module attributes {} { +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { fir.global @_QMhelperEgli : i32 { %0 = fir.zero_bits i32 fir.has_value %0 : i32 From baad1b85b93c0b5ce0341668259ae21911bed8b1 Mon Sep 17 00:00:00 2001 From: Shivam Gupta Date: Tue, 11 Jun 2024 12:52:32 +0530 Subject: [PATCH 31/82] [lldb] Add a test for lea_rsp_pattern_p to x86 unwinder (NFC) (#94852) This commit adds a test for lea_rsp_pattern_p which was previously due as FIXME. 
--- .../x86/Testx86AssemblyInspectionEngine.cpp | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp index 277cc14ce50c9f..597e5b2e40d5e0 100644 --- a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp +++ b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp @@ -1731,7 +1731,29 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddESP) { EXPECT_EQ(4 - 16, row_sp->GetCFAValue().GetOffset()); } -// FIXME add test for lea_rsp_pattern_p +TEST_F(Testx86AssemblyInspectionEngine, TestLEA_RSP_Pattern) { + UnwindPlan::Row::RegisterLocation regloc; + UnwindPlan::RowSP row_sp; + AddressRange sample_range; + UnwindPlan unwind_plan(eRegisterKindLLDB); + std::unique_ptr engine = Getx86_64Inspector(); + + uint8_t data[] = { + 0x8d, 0x64, 0x24, 0x10, // lea rsp, [rsp + 0x10] + 0x90 // nop + }; + + sample_range = AddressRange(0x1000, sizeof(data)); + + EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly( + data, sizeof(data), sample_range, unwind_plan)); + + row_sp = unwind_plan.GetRowForFunctionOffset(0); + EXPECT_EQ(0ull, row_sp->GetOffset()); + EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp); + EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true); + EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset()); +} TEST_F(Testx86AssemblyInspectionEngine, TestPopRBX) { UnwindPlan::Row::RegisterLocation regloc; From bd9a525efdab2a83cb24773d95ce8c4a2e9cce68 Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Tue, 11 Jun 2024 09:30:15 +0200 Subject: [PATCH 32/82] [AArch64] Fix gcc "enumeral and non-enumeral type" warning Without this gcc (9.3.0) warns with ../lib/Target/AArch64/GISel/AArch64CallLowering.cpp: In function 'unsigned int getCallOpcode(const llvm::MachineFunction&, bool, bool, std::optional&, llvm::MachineRegisterInfo&)': 
../lib/Target/AArch64/GISel/AArch64CallLowering.cpp:1025: warning: enumeral and non-enumeral type in conditional expression [-Wextra] 1025 | return IsIndirect ? getBLRCallOpcode(CallerF) : AArch64::BL; | --- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 270474f80767aa..322bde3da67631 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -1022,7 +1022,7 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, if (!IsTailCall) { if (!PAI) - return IsIndirect ? getBLRCallOpcode(CallerF) : AArch64::BL; + return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL; assert(IsIndirect && "Direct call should not be authenticated"); assert((PAI->Key == AArch64PACKey::IA || PAI->Key == AArch64PACKey::IB) && From b1fe03f0840a2c488b1f07a669bfea3cc986ce3b Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Tue, 11 Jun 2024 13:16:51 +0530 Subject: [PATCH 33/82] [APFloat] Add APFloat support for FP6 data types (#94735) This patch adds APFloat type support for two FP6 data types, E2M3 and E3M2. 
The definitions for the two formats are detailed in section 5.3.2 of the OCP specification, which can be accessed here: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf Signed-off-by: Durgadoss R --- clang/lib/AST/MicrosoftMangle.cpp | 2 + llvm/include/llvm/ADT/APFloat.h | 25 ++ llvm/lib/Support/APFloat.cpp | 87 +++++- llvm/unittests/ADT/APFloatTest.cpp | 484 +++++++++++++++++++++++++++-- 4 files changed, 563 insertions(+), 35 deletions(-) diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 2f7a2763639207..ffc5d2d4cd8fc3 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) { case APFloat::S_Float8E4M3FNUZ: case APFloat::S_Float8E4M3B11FNUZ: case APFloat::S_FloatTF32: + case APFloat::S_Float6E3M2FN: + case APFloat::S_Float6E2M3FN: llvm_unreachable("Tried to mangle unexpected APFloat semantics"); } diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 78faadb30d9eb5..a9bb6cc9999b1e 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -189,6 +189,14 @@ struct APFloatBase { // improved range compared to half (16-bit) formats, at (potentially) // greater throughput than single precision (32-bit) formats. S_FloatTF32, + // 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754 + // types, there are no infinity or NaN values. The format is detailed in + // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + S_Float6E3M2FN, + // 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754 + // types, there are no infinity or NaN values. 
The format is detailed in + // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + S_Float6E2M3FN, S_x87DoubleExtended, S_MaxSemantics = S_x87DoubleExtended, @@ -209,6 +217,8 @@ struct APFloatBase { static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE; static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE; static const fltSemantics &FloatTF32() LLVM_READNONE; + static const fltSemantics &Float6E3M2FN() LLVM_READNONE; + static const fltSemantics &Float6E2M3FN() LLVM_READNONE; static const fltSemantics &x87DoubleExtended() LLVM_READNONE; /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with @@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase { APInt convertFloat8E4M3FNUZAPFloatToAPInt() const; APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const; APInt convertFloatTF32APFloatToAPInt() const; + APInt convertFloat6E3M2FNAPFloatToAPInt() const; + APInt convertFloat6E2M3FNAPFloatToAPInt() const; void initFromAPInt(const fltSemantics *Sem, const APInt &api); template void initFromIEEEAPInt(const APInt &api); void initFromHalfAPInt(const APInt &api); @@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase { void initFromFloat8E4M3FNUZAPInt(const APInt &api); void initFromFloat8E4M3B11FNUZAPInt(const APInt &api); void initFromFloatTF32APInt(const APInt &api); + void initFromFloat6E3M2FNAPInt(const APInt &api); + void initFromFloat6E2M3FNAPInt(const APInt &api); void assign(const IEEEFloat &); void copySignificand(const IEEEFloat &); @@ -1046,6 +1060,17 @@ class APFloat : public APFloatBase { /// \param Semantics - type float semantics static APFloat getAllOnesValue(const fltSemantics &Semantics); + static bool hasNanOrInf(const fltSemantics &Sem) { + switch (SemanticsToEnum(Sem)) { + default: + return true; + // Below Semantics do not support {NaN or Inf} + case APFloat::S_Float6E3M2FN: + case APFloat::S_Float6E2M3FN: + return false; + } + } + /// Used to insert APFloat objects, or 
objects that contain APFloat objects, /// into FoldingSets. void Profile(FoldingSetNodeID &NID) const; diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 283fcc153b33aa..1209bf71a287d7 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior { // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available // encodings do not distinguish between signalling and quiet NaN. NanOnly, + + // This behavior is present in Float6E3M2FN and Float6E2M3FN types, + // which do not support Inf or NaN values. + FiniteOnly, }; // How NaN values are represented. This is curently only used in combination @@ -139,6 +143,10 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = { static constexpr fltSemantics semFloat8E4M3B11FNUZ = { 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19}; +static constexpr fltSemantics semFloat6E3M2FN = { + 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly}; +static constexpr fltSemantics semFloat6E2M3FN = { + 2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly}; static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80}; static constexpr fltSemantics semBogus = {0, 0, 0, 0}; @@ -206,6 +214,10 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) { return Float8E4M3B11FNUZ(); case S_FloatTF32: return FloatTF32(); + case S_Float6E3M2FN: + return Float6E3M2FN(); + case S_Float6E2M3FN: + return Float6E2M3FN(); case S_x87DoubleExtended: return x87DoubleExtended(); } @@ -238,6 +250,10 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) { return S_Float8E4M3B11FNUZ; else if (&Sem == &llvm::APFloat::FloatTF32()) return S_FloatTF32; + else if (&Sem == &llvm::APFloat::Float6E3M2FN()) + return S_Float6E3M2FN; + else if (&Sem == &llvm::APFloat::Float6E2M3FN()) + return S_Float6E2M3FN; else if (&Sem == &llvm::APFloat::x87DoubleExtended()) 
return S_x87DoubleExtended; else @@ -260,6 +276,8 @@ const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() { return semFloat8E4M3B11FNUZ; } const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; } +const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; } +const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; } const fltSemantics &APFloatBase::x87DoubleExtended() { return semX87DoubleExtended; } @@ -878,6 +896,9 @@ void IEEEFloat::copySignificand(const IEEEFloat &rhs) { for the significand. If double or longer, this is a signalling NaN, which may not be ideal. If float, this is QNaN(0). */ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) { + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly) + llvm_unreachable("This floating point format does not support NaN"); + category = fcNaN; sign = Negative; exponent = exponentNaN(); @@ -1499,16 +1520,18 @@ static void tcSetLeastSignificantBits(APInt::WordType *dst, unsigned parts, /* Handle overflow. Sign is preserved. We either become infinity or the largest finite number. */ IEEEFloat::opStatus IEEEFloat::handleOverflow(roundingMode rounding_mode) { - /* Infinity? */ - if (rounding_mode == rmNearestTiesToEven || - rounding_mode == rmNearestTiesToAway || - (rounding_mode == rmTowardPositive && !sign) || - (rounding_mode == rmTowardNegative && sign)) { - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) - makeNaN(false, sign); - else - category = fcInfinity; - return (opStatus) (opOverflow | opInexact); + if (semantics->nonFiniteBehavior != fltNonfiniteBehavior::FiniteOnly) { + /* Infinity? 
*/ + if (rounding_mode == rmNearestTiesToEven || + rounding_mode == rmNearestTiesToAway || + (rounding_mode == rmTowardPositive && !sign) || + (rounding_mode == rmTowardNegative && sign)) { + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + makeNaN(false, sign); + else + category = fcInfinity; + return static_cast(opOverflow | opInexact); + } } /* Otherwise we become the largest finite number. */ @@ -3518,13 +3541,15 @@ APInt IEEEFloat::convertIEEEFloatToAPInt() const { myexponent = ::exponentZero(S) + bias; mysignificand.fill(0); } else if (category == fcInfinity) { - if (S.nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) { + if (S.nonFiniteBehavior == fltNonfiniteBehavior::NanOnly || + S.nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly) llvm_unreachable("semantics don't support inf!"); - } myexponent = ::exponentInf(S) + bias; mysignificand.fill(0); } else { assert(category == fcNaN && "Unknown category!"); + if (S.nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly) + llvm_unreachable("semantics don't support NaN!"); myexponent = ::exponentNaN(S) + bias; std::copy_n(significandParts(), mysignificand.size(), mysignificand.begin()); @@ -3605,6 +3630,16 @@ APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const { return convertIEEEFloatToAPInt(); } +APInt IEEEFloat::convertFloat6E3M2FNAPFloatToAPInt() const { + assert(partCount() == 1); + return convertIEEEFloatToAPInt(); +} + +APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const { + assert(partCount() == 1); + return convertIEEEFloatToAPInt(); +} + // This function creates an APInt that is just a bit map of the floating // point constant as it would appear in memory. It is not a conversion, // and treating the result as a normal integer is unlikely to be useful. 
@@ -3646,6 +3681,12 @@ APInt IEEEFloat::bitcastToAPInt() const { if (semantics == (const llvm::fltSemantics *)&semFloatTF32) return convertFloatTF32APFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat6E3M2FN) + return convertFloat6E3M2FNAPFloatToAPInt(); + + if (semantics == (const llvm::fltSemantics *)&semFloat6E2M3FN) + return convertFloat6E2M3FNAPFloatToAPInt(); + assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended && "unknown format!"); return convertF80LongDoubleAPFloatToAPInt(); @@ -3862,6 +3903,14 @@ void IEEEFloat::initFromFloatTF32APInt(const APInt &api) { initFromIEEEAPInt(api); } +void IEEEFloat::initFromFloat6E3M2FNAPInt(const APInt &api) { + initFromIEEEAPInt(api); +} + +void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) { + initFromIEEEAPInt(api); +} + /// Treat api as containing the bits of a floating point number. void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { assert(api.getBitWidth() == Sem->sizeInBits); @@ -3891,6 +3940,10 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { return initFromFloat8E4M3B11FNUZAPInt(api); if (Sem == &semFloatTF32) return initFromFloatTF32APInt(api); + if (Sem == &semFloat6E3M2FN) + return initFromFloat6E3M2FNAPInt(api); + if (Sem == &semFloat6E2M3FN) + return initFromFloat6E2M3FNAPInt(api); llvm_unreachable(nullptr); } @@ -4328,7 +4381,8 @@ int IEEEFloat::getExactLog2Abs() const { bool IEEEFloat::isSignaling() const { if (!isNaN()) return false; - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly || + semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly) return false; // IEEE-754R 2008 6.2.1: A signaling NaN bit string should be encoded with the @@ -4387,6 +4441,10 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) { // nextUp(getLargest()) == NAN makeNaN(); break; + } else if (semantics->nonFiniteBehavior == + 
fltNonfiniteBehavior::FiniteOnly) { + // nextUp(getLargest()) == getLargest() + break; } else { // nextUp(getLargest()) == INFINITY APInt::tcSet(significandParts(), 0, partCount()); @@ -4477,6 +4535,9 @@ APFloatBase::ExponentType IEEEFloat::exponentZero() const { } void IEEEFloat::makeInf(bool Negative) { + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly) + llvm_unreachable("This floating point format does not support Inf"); + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) { // There is no Inf, so make NaN instead. makeNaN(false, Negative); diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index 6e4dda8351a1b1..7007d944801a75 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -723,11 +723,13 @@ TEST(APFloatTest, IsSmallestNormalized) { EXPECT_FALSE(APFloat::getZero(Semantics, false).isSmallestNormalized()); EXPECT_FALSE(APFloat::getZero(Semantics, true).isSmallestNormalized()); - EXPECT_FALSE(APFloat::getInf(Semantics, false).isSmallestNormalized()); - EXPECT_FALSE(APFloat::getInf(Semantics, true).isSmallestNormalized()); + if (APFloat::hasNanOrInf(Semantics)) { + EXPECT_FALSE(APFloat::getInf(Semantics, false).isSmallestNormalized()); + EXPECT_FALSE(APFloat::getInf(Semantics, true).isSmallestNormalized()); - EXPECT_FALSE(APFloat::getQNaN(Semantics).isSmallestNormalized()); - EXPECT_FALSE(APFloat::getSNaN(Semantics).isSmallestNormalized()); + EXPECT_FALSE(APFloat::getQNaN(Semantics).isSmallestNormalized()); + EXPECT_FALSE(APFloat::getSNaN(Semantics).isSmallestNormalized()); + } EXPECT_FALSE(APFloat::getLargest(Semantics).isSmallestNormalized()); EXPECT_FALSE(APFloat::getLargest(Semantics, true).isSmallestNormalized()); @@ -1823,6 +1825,9 @@ TEST(APFloatTest, getLargest) { 30, APFloat::getLargest(APFloat::Float8E4M3B11FNUZ()).convertToDouble()); EXPECT_EQ(3.40116213421e+38f, APFloat::getLargest(APFloat::FloatTF32()).convertToFloat()); + EXPECT_EQ(28, 
APFloat::getLargest(APFloat::Float6E3M2FN()).convertToDouble()); + EXPECT_EQ(7.5, + APFloat::getLargest(APFloat::Float6E2M3FN()).convertToDouble()); } TEST(APFloatTest, getSmallest) { @@ -1881,6 +1886,20 @@ TEST(APFloatTest, getSmallest) { EXPECT_TRUE(test.isFiniteNonZero()); EXPECT_TRUE(test.isDenormal()); EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + test = APFloat::getSmallest(APFloat::Float6E3M2FN(), false); + expected = APFloat(APFloat::Float6E3M2FN(), "0x0.1p0"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + test = APFloat::getSmallest(APFloat::Float6E2M3FN(), false); + expected = APFloat(APFloat::Float6E2M3FN(), "0x0.2p0"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); } TEST(APFloatTest, getSmallestNormalized) { @@ -1963,6 +1982,21 @@ TEST(APFloatTest, getSmallestNormalized) { EXPECT_FALSE(test.isDenormal()); EXPECT_TRUE(test.bitwiseIsEqual(expected)); EXPECT_TRUE(test.isSmallestNormalized()); + test = APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false); + expected = APFloat(APFloat::Float6E3M2FN(), "0x1p-2"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_FALSE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + EXPECT_TRUE(test.isSmallestNormalized()); + + test = APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false); + expected = APFloat(APFloat::Float6E2M3FN(), "0x1p0"); + EXPECT_FALSE(test.isNegative()); + EXPECT_TRUE(test.isFiniteNonZero()); + EXPECT_FALSE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + EXPECT_TRUE(test.isSmallestNormalized()); } TEST(APFloatTest, getZero) { @@ -1996,7 +2030,11 @@ TEST(APFloatTest, getZero) { {&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1}, {&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1}, 
{&APFloat::FloatTF32(), false, true, {0, 0}, 1}, - {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1}}; + {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1}, + {&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1}, + {&APFloat::Float6E3M2FN(), true, true, {0x20ULL, 0}, 1}, + {&APFloat::Float6E2M3FN(), false, true, {0, 0}, 1}, + {&APFloat::Float6E2M3FN(), true, true, {0x20ULL, 0}, 1}}; const unsigned NumGetZeroTests = std::size(GetZeroTest); for (unsigned i = 0; i < NumGetZeroTests; ++i) { APFloat test = APFloat::getZero(*GetZeroTest[i].semantics, @@ -5161,6 +5199,90 @@ TEST(APFloatTest, Float8ExhaustivePair) { } } +TEST(APFloatTest, Float6ExhaustivePair) { + // Test each pair of 6-bit floats with non-standard semantics + for (APFloat::Semantics Sem : + {APFloat::S_Float6E3M2FN, APFloat::S_Float6E2M3FN}) { + const llvm::fltSemantics &S = APFloat::EnumToSemantics(Sem); + for (int i = 1; i < 64; i++) { + for (int j = 1; j < 64; j++) { + SCOPED_TRACE("sem=" + std::to_string(Sem) + ",i=" + std::to_string(i) + + ",j=" + std::to_string(j)); + APFloat x(S, APInt(6, i)); + APFloat y(S, APInt(6, j)); + + bool losesInfo; + APFloat x16 = x; + x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_FALSE(losesInfo); + APFloat y16 = y; + y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_FALSE(losesInfo); + + // Add + APFloat z = x; + z.add(y, APFloat::rmNearestTiesToEven); + APFloat z16 = x16; + z16.add(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Subtract + z = x; + z.subtract(y, APFloat::rmNearestTiesToEven); + z16 = x16; + z16.subtract(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Multiply + z = x; + z.multiply(y, 
APFloat::rmNearestTiesToEven); + z16 = x16; + z16.multiply(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Skip divide by 0 + if (j == 0 || j == 32) + continue; + + // Divide + z = x; + z.divide(y, APFloat::rmNearestTiesToEven); + z16 = x16; + z16.divide(y16, APFloat::rmNearestTiesToEven); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Mod + z = x; + z.mod(y); + z16 = x16; + z16.mod(y16); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + + // Remainder + z = x; + z.remainder(y); + z16 = x16; + z16.remainder(y16); + z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_TRUE(z.bitwiseIsEqual(z16)) + << "sem=" << Sem << ", i=" << i << ", j=" << j; + } + } + } +} + TEST(APFloatTest, ConvertE4M3FNToE5M2) { bool losesInfo; APFloat test(APFloat::Float8E4M3FN(), "1.0"); @@ -6620,28 +6742,39 @@ TEST(APFloatTest, getExactLog2) { EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2()); EXPECT_EQ(INT_MIN, APFloat(Semantics, "3.0").getExactLog2Abs()); EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2Abs()); - EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2()); - EXPECT_EQ(INT_MIN, APFloat(Semantics, "-8.0").getExactLog2()); - EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2()); - EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2Abs()); - EXPECT_EQ(INT_MIN, APFloat(Semantics, "-0.25").getExactLog2()); - EXPECT_EQ(-2, APFloat(Semantics, "-0.25").getExactLog2Abs()); - EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2Abs()); - EXPECT_EQ(3, APFloat(Semantics, "-8.0").getExactLog2Abs()); + + if (I == APFloat::S_Float6E2M3FN) { + EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2()); + 
EXPECT_EQ(INT_MIN, APFloat(Semantics, "-4.0").getExactLog2()); + EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2Abs()); + EXPECT_EQ(2, APFloat(Semantics, "-4.0").getExactLog2Abs()); + } else { + EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2()); + EXPECT_EQ(INT_MIN, APFloat(Semantics, "-8.0").getExactLog2()); + EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2()); + EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2Abs()); + EXPECT_EQ(INT_MIN, APFloat(Semantics, "-0.25").getExactLog2()); + EXPECT_EQ(-2, APFloat(Semantics, "-0.25").getExactLog2Abs()); + EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2Abs()); + EXPECT_EQ(3, APFloat(Semantics, "-8.0").getExactLog2Abs()); + } EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, false).getExactLog2()); EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, true).getExactLog2()); - EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2()); - EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2()); - EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2()); - EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2()); - EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, false).getExactLog2Abs()); EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, true).getExactLog2Abs()); - EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2Abs()); - EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2Abs()); - EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2Abs()); - EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2Abs()); + + if (APFloat::hasNanOrInf(Semantics)) { + EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2()); + EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2()); + EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2()); + EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2()); + + EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2Abs()); + EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, 
true).getExactLog2Abs()); + EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2Abs()); + EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2Abs()); + } EXPECT_EQ(INT_MIN, scalbn(One, MinExp - Precision - 1, APFloat::rmNearestTiesToEven) @@ -6660,4 +6793,311 @@ TEST(APFloatTest, getExactLog2) { } } +TEST(APFloatTest, Float6E3M2FNFromString) { + // Exactly representable + EXPECT_EQ(28, APFloat(APFloat::Float6E3M2FN(), "28").convertToDouble()); + // Round down to maximum value + EXPECT_EQ(28, APFloat(APFloat::Float6E3M2FN(), "32").convertToDouble()); + +#ifdef GTEST_HAS_DEATH_TEST +#ifndef NDEBUG + EXPECT_DEATH(APFloat(APFloat::Float6E3M2FN(), "inf"), + "This floating point format does not support Inf"); + EXPECT_DEATH(APFloat(APFloat::Float6E3M2FN(), "nan"), + "This floating point format does not support NaN"); +#endif +#endif + + EXPECT_TRUE(APFloat(APFloat::Float6E3M2FN(), "0").isPosZero()); + EXPECT_TRUE(APFloat(APFloat::Float6E3M2FN(), "-0").isNegZero()); +} + +TEST(APFloatTest, Float6E2M3FNFromString) { + // Exactly representable + EXPECT_EQ(7.5, APFloat(APFloat::Float6E2M3FN(), "7.5").convertToDouble()); + // Round down to maximum value + EXPECT_EQ(7.5, APFloat(APFloat::Float6E2M3FN(), "32").convertToDouble()); + +#ifdef GTEST_HAS_DEATH_TEST +#ifndef NDEBUG + EXPECT_DEATH(APFloat(APFloat::Float6E2M3FN(), "inf"), + "This floating point format does not support Inf"); + EXPECT_DEATH(APFloat(APFloat::Float6E2M3FN(), "nan"), + "This floating point format does not support NaN"); +#endif +#endif + + EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "0").isPosZero()); + EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "-0").isNegZero()); +} + +TEST(APFloatTest, ConvertE3M2FToE2M3F) { + bool losesInfo; + APFloat test(APFloat::Float6E3M2FN(), "1.0"); + APFloat::opStatus status = test.convert( + APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(1.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, 
APFloat::opOK); + + test = APFloat(APFloat::Float6E3M2FN(), "0.0"); + status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + // Test overflow + losesInfo = false; + test = APFloat(APFloat::Float6E3M2FN(), "28"); + status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(7.5f, test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opInexact); + + // Test underflow + test = APFloat(APFloat::Float6E3M2FN(), ".0625"); + status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0., test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact); + + // Testing inexact rounding to denormal number + losesInfo = false; + test = APFloat(APFloat::Float6E3M2FN(), "0.1875"); + status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0.25, test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact); +} + +TEST(APFloatTest, ConvertE2M3FToE3M2F) { + bool losesInfo; + APFloat test(APFloat::Float6E2M3FN(), "1.0"); + APFloat::opStatus status = test.convert( + APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven, &losesInfo); + EXPECT_EQ(1.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + test = APFloat(APFloat::Float6E2M3FN(), "0.0"); + status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(0.0f, test.convertToFloat()); + EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + test = APFloat(APFloat::Float6E2M3FN(), ".125"); + status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(.125, test.convertToFloat()); + 
EXPECT_FALSE(losesInfo); + EXPECT_EQ(status, APFloat::opOK); + + // Test inexact rounding + losesInfo = false; + test = APFloat(APFloat::Float6E2M3FN(), "7.5"); + status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_EQ(8, test.convertToFloat()); + EXPECT_TRUE(losesInfo); + EXPECT_EQ(status, APFloat::opInexact); +} + +TEST(APFloatTest, Float6E3M2FNNext) { + APFloat test(APFloat::Float6E3M2FN(), APFloat::uninitialized); + APFloat expected(APFloat::Float6E3M2FN(), APFloat::uninitialized); + + // 1. NextUp of largest bit pattern is the same + test = APFloat::getLargest(APFloat::Float6E3M2FN()); + expected = APFloat::getLargest(APFloat::Float6E3M2FN()); + EXPECT_EQ(test.next(false), APFloat::opOK); + EXPECT_FALSE(test.isInfinity()); + EXPECT_FALSE(test.isZero()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 2. NextUp of smallest negative denormal is -0 + test = APFloat::getSmallest(APFloat::Float6E3M2FN(), true); + expected = APFloat::getZero(APFloat::Float6E3M2FN(), true); + EXPECT_EQ(test.next(false), APFloat::opOK); + EXPECT_TRUE(test.isNegZero()); + EXPECT_FALSE(test.isPosZero()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 3. nextDown of negative of largest value is the same + test = APFloat::getLargest(APFloat::Float6E3M2FN(), true); + expected = test; + EXPECT_EQ(test.next(true), APFloat::opOK); + EXPECT_FALSE(test.isInfinity()); + EXPECT_FALSE(test.isZero()); + EXPECT_FALSE(test.isNaN()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 4. 
nextDown of +0 is smallest negative denormal + test = APFloat::getZero(APFloat::Float6E3M2FN(), false); + expected = APFloat::getSmallest(APFloat::Float6E3M2FN(), true); + EXPECT_EQ(test.next(true), APFloat::opOK); + EXPECT_FALSE(test.isZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); +} + +TEST(APFloatTest, Float6E2M3FNNext) { + APFloat test(APFloat::Float6E2M3FN(), APFloat::uninitialized); + APFloat expected(APFloat::Float6E2M3FN(), APFloat::uninitialized); + + // 1. NextUp of largest bit pattern is the same + test = APFloat::getLargest(APFloat::Float6E2M3FN()); + expected = APFloat::getLargest(APFloat::Float6E2M3FN()); + EXPECT_EQ(test.next(false), APFloat::opOK); + EXPECT_FALSE(test.isInfinity()); + EXPECT_FALSE(test.isZero()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 2. NextUp of smallest negative denormal is -0 + test = APFloat::getSmallest(APFloat::Float6E2M3FN(), true); + expected = APFloat::getZero(APFloat::Float6E2M3FN(), true); + EXPECT_EQ(test.next(false), APFloat::opOK); + EXPECT_TRUE(test.isNegZero()); + EXPECT_FALSE(test.isPosZero()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 3. nextDown of negative of largest value is the same + test = APFloat::getLargest(APFloat::Float6E2M3FN(), true); + expected = test; + EXPECT_EQ(test.next(true), APFloat::opOK); + EXPECT_FALSE(test.isInfinity()); + EXPECT_FALSE(test.isZero()); + EXPECT_FALSE(test.isNaN()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); + + // 4. 
nextDown of +0 is smallest negative denormal + test = APFloat::getZero(APFloat::Float6E2M3FN(), false); + expected = APFloat::getSmallest(APFloat::Float6E2M3FN(), true); + EXPECT_EQ(test.next(true), APFloat::opOK); + EXPECT_FALSE(test.isZero()); + EXPECT_TRUE(test.isDenormal()); + EXPECT_TRUE(test.bitwiseIsEqual(expected)); +} + +#ifdef GTEST_HAS_DEATH_TEST +#ifndef NDEBUG +TEST(APFloatTest, Float6E3M2FNGetInfNaN) { + EXPECT_DEATH(APFloat::getInf(APFloat::Float6E3M2FN()), + "This floating point format does not support Inf"); + EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E3M2FN()), + "This floating point format does not support NaN"); +} + +TEST(APFloatTest, Float6E2M3FNGetInfNaN) { + EXPECT_DEATH(APFloat::getInf(APFloat::Float6E2M3FN()), + "This floating point format does not support Inf"); + EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E2M3FN()), + "This floating point format does not support NaN"); +} +#endif +#endif + +TEST(APFloatTest, Float6E3M2FNToDouble) { + APFloat One(APFloat::Float6E3M2FN(), "1.0"); + EXPECT_EQ(1.0, One.convertToDouble()); + APFloat Two(APFloat::Float6E3M2FN(), "2.0"); + EXPECT_EQ(2.0, Two.convertToDouble()); + APFloat PosLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), false); + EXPECT_EQ(28., PosLargest.convertToDouble()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), true); + EXPECT_EQ(-28., NegLargest.convertToDouble()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false); + EXPECT_EQ(0x1p-2, PosSmallest.convertToDouble()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), true); + EXPECT_EQ(-0x1p-2, NegSmallest.convertToDouble()); + + APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E3M2FN(), false); + EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x0.1p0, SmallestDenorm.convertToDouble()); +} + +TEST(APFloatTest, Float6E2M3FNToDouble) { + APFloat One(APFloat::Float6E2M3FN(), "1.0"); + EXPECT_EQ(1.0, 
One.convertToDouble()); + APFloat Two(APFloat::Float6E2M3FN(), "2.0"); + EXPECT_EQ(2.0, Two.convertToDouble()); + APFloat PosLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), false); + EXPECT_EQ(7.5, PosLargest.convertToDouble()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), true); + EXPECT_EQ(-7.5, NegLargest.convertToDouble()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false); + EXPECT_EQ(0x1p0, PosSmallest.convertToDouble()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), true); + EXPECT_EQ(-0x1p0, NegSmallest.convertToDouble()); + + APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E2M3FN(), false); + EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToDouble()); +} + +TEST(APFloatTest, Float6E3M2FNToFloat) { + APFloat PosZero = APFloat::getZero(APFloat::Float6E3M2FN()); + APFloat PosZeroToFloat(PosZero.convertToFloat()); + EXPECT_TRUE(PosZeroToFloat.isPosZero()); + APFloat NegZero = APFloat::getZero(APFloat::Float6E3M2FN(), true); + APFloat NegZeroToFloat(NegZero.convertToFloat()); + EXPECT_TRUE(NegZeroToFloat.isNegZero()); + + APFloat One(APFloat::Float6E3M2FN(), "1.0"); + EXPECT_EQ(1.0F, One.convertToFloat()); + APFloat Two(APFloat::Float6E3M2FN(), "2.0"); + EXPECT_EQ(2.0F, Two.convertToFloat()); + + APFloat PosLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), false); + EXPECT_EQ(28., PosLargest.convertToFloat()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), true); + EXPECT_EQ(-28, NegLargest.convertToFloat()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false); + EXPECT_EQ(0x1p-2, PosSmallest.convertToFloat()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), true); + EXPECT_EQ(-0x1p-2, NegSmallest.convertToFloat()); + + APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E3M2FN(), false); + 
EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x0.1p0, SmallestDenorm.convertToFloat()); +} + +TEST(APFloatTest, Float6E2M3FNToFloat) { + APFloat PosZero = APFloat::getZero(APFloat::Float6E2M3FN()); + APFloat PosZeroToFloat(PosZero.convertToFloat()); + EXPECT_TRUE(PosZeroToFloat.isPosZero()); + APFloat NegZero = APFloat::getZero(APFloat::Float6E2M3FN(), true); + APFloat NegZeroToFloat(NegZero.convertToFloat()); + EXPECT_TRUE(NegZeroToFloat.isNegZero()); + + APFloat One(APFloat::Float6E2M3FN(), "1.0"); + EXPECT_EQ(1.0F, One.convertToFloat()); + APFloat Two(APFloat::Float6E2M3FN(), "2.0"); + EXPECT_EQ(2.0F, Two.convertToFloat()); + + APFloat PosLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), false); + EXPECT_EQ(7.5, PosLargest.convertToFloat()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), true); + EXPECT_EQ(-7.5, NegLargest.convertToFloat()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false); + EXPECT_EQ(0x1p0, PosSmallest.convertToFloat()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), true); + EXPECT_EQ(-0x1p0, NegSmallest.convertToFloat()); + + APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E2M3FN(), false); + EXPECT_TRUE(SmallestDenorm.isDenormal()); + EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToFloat()); +} } // namespace From c63a622ba7547812939e2fd3dbfbe50e6cda2a42 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 11 Jun 2024 08:58:28 +0100 Subject: [PATCH 34/82] [AArch64] Disable red-zone when lowering Q-reg copy through memory. (#94962) This was pointed out in PR #93940. 
--- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 10 +++++++++- llvm/test/CodeGen/AArch64/arm64-redzone.ll | 13 +++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index cd532671f50189..cf617c7e92a70a 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -431,8 +431,16 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); uint64_t NumBytes = AFI->getLocalStackSize(); + // If neither NEON nor SVE is available, a COPY from one Q-reg to + // another requires a spill -> reload sequence. We can do that + // using a pre-decrementing store/post-decrementing load, but + // if we do so, we can't use the Red Zone. + bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() && + !Subtarget.isNeonAvailable() && + !Subtarget.hasSVE(); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize || - getSVEStackSize(MF)); + getSVEStackSize(MF) || LowerQRegCopyThroughMem); } /// hasFP - Return true if the specified function should have a dedicated frame diff --git a/llvm/test/CodeGen/AArch64/arm64-redzone.ll b/llvm/test/CodeGen/AArch64/arm64-redzone.ll index fe30a1a98521e1..d001bc2a8dbe4e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-redzone.ll +++ b/llvm/test/CodeGen/AArch64/arm64-redzone.ll @@ -16,3 +16,16 @@ define i32 @foo(i32 %a, i32 %b) nounwind ssp { %tmp2 = load i32, ptr %x, align 4 ret i32 %tmp2 } + +; We disable red-zone if NEON is not available because copies of Q-regs +; require a spill/fill and dynamic allocation. But we only need to do +; this when FP registers are enabled. 
+define void @bar(fp128 %f) "target-features"="-fp-armv8" { +; CHECK-LABEL: bar: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x0, x1, [sp, #-16] +; CHECK-NEXT: ret + %ptr = alloca fp128 + store fp128 %f, ptr %ptr + ret void +} From affd73aadb9611b8a10c8cced63ad4ae1a0a9161 Mon Sep 17 00:00:00 2001 From: Arjun P Date: Tue, 11 Jun 2024 09:43:56 +0100 Subject: [PATCH 35/82] MathExtras: MulOverflow: use builtin when available (NFC) (#95046) This matches the other two functions AddOverflow and SubOverflow. --- llvm/include/llvm/Support/MathExtras.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index f0e4ee534ece38..adb5ba6e3dcc2f 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -635,6 +635,9 @@ std::enable_if_t<std::is_signed_v<T>, T> SubOverflow(T X, T Y, T &Result) { /// result, returning true if an overflow ocurred. template <typename T> std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) { +#if __has_builtin(__builtin_mul_overflow) + return __builtin_mul_overflow(X, Y, &Result); +#else // Perform the unsigned multiplication on absolute values. using U = std::make_unsigned_t<T>; const U UX = X < 0 ? 
(0 - static_cast<U>(X)) : static_cast<U>(X); @@ -656,6 +659,7 @@ std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) { return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY; else return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY; +#endif } } // namespace llvm From 79ce70b8033815b6abd3a9a5cc2335de70f1aaab Mon Sep 17 00:00:00 2001 From: Braden Helmer Date: Tue, 11 Jun 2024 04:54:00 -0400 Subject: [PATCH 36/82] [NFC] Mitigate pointless copies (#95052) Fixes #95036 #95033 #94933 #94930 --- llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 2 +- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +- llvm/lib/CodeGen/RegAllocGreedy.h | 2 +- llvm/lib/ObjCopy/ELF/ELFObject.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index 2dcd7805b6c96b..dbb658940eef12 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -61,7 +61,7 @@ class DWARFFormValue { const DWARFUnit *U = nullptr; /// Remember the DWARFUnit at extract time. const DWARFContext *C = nullptr; /// Context for extract time. 
- DWARFFormValue(dwarf::Form F, ValueType V) : Form(F), Value(V) {} + DWARFFormValue(dwarf::Form F, const ValueType &V) : Form(F), Value(V) {} public: DWARFFormValue(dwarf::Form F = dwarf::Form(0)) : Form(F) {} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index e54ec4f2b1d72d..bff49dab4a313d 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -198,7 +198,7 @@ struct TargetRegionEntryInfo { unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count); - bool operator<(const TargetRegionEntryInfo RHS) const { + bool operator<(const TargetRegionEntryInfo &RHS) const { return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) < std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line, RHS.Count); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index 1941643bba9e66..06cf0828ea79b3 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -425,7 +425,7 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass, ZeroCostFoldedReloads || Copies); } - void add(RAGreedyStats other) { + void add(const RAGreedyStats &other) { Reloads += other.Reloads; FoldedReloads += other.FoldedReloads; ZeroCostFoldedReloads += other.ZeroCostFoldedReloads; diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h index f72c109b6009e8..2b1895a30b41ed 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObject.h +++ b/llvm/lib/ObjCopy/ELF/ELFObject.h @@ -910,7 +910,7 @@ class RelocationSection public: RelocationSection(const Object &O) : Obj(O) {} - void addRelocation(Relocation Rel) { Relocations.push_back(Rel); } + void addRelocation(const Relocation &Rel) { Relocations.push_back(Rel); } Error accept(SectionVisitor &Visitor) const override; Error accept(MutableSectionVisitor &Visitor) override; Error removeSectionReferences( From 
35ddc17f36282f24324275e0691fb57e270f113d Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 11 Jun 2024 10:04:12 +0100 Subject: [PATCH 37/82] [StackSafety] Make lit tests compatible with lit's internal shell. NFC. (#94971) Use DEFINE: %{res} = ... instead of $(cat ...). Rewrite one use of a subshell to write to a temporary file instead. --- .../Analysis/StackSafetyAnalysis/ipa-alias.ll | 39 +++-- llvm/test/Analysis/StackSafetyAnalysis/ipa.ll | 138 +++++++++--------- 2 files changed, 88 insertions(+), 89 deletions(-) diff --git a/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll b/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll index a4846dade4dbd5..7d2b2a3b25d573 100644 --- a/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll +++ b/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll @@ -1,5 +1,4 @@ ; REQUIRES: aarch64-registered-target -; REQUIRES: shell ; Test IPA over a single combined file ; RUN: llvm-as %s -o %t0.bc @@ -14,27 +13,27 @@ ; RUN: opt -module-summary %s -o %t.summ0.bc ; RUN: opt -module-summary %S/Inputs/ipa-alias.ll -o %t.summ1.bc -; RUN: echo > %t.res.txt \ -; RUN: -r %t.summ0.bc,AliasCall,px \ -; RUN: -r %t.summ0.bc,AliasToBitcastAliasWrite1, \ -; RUN: -r %t.summ0.bc,AliasToPreemptableAliasWrite1, \ -; RUN: -r %t.summ0.bc,AliasWrite1, \ -; RUN: -r %t.summ0.bc,BitcastAliasCall,px \ -; RUN: -r %t.summ0.bc,BitcastAliasWrite1, \ -; RUN: -r %t.summ0.bc,InterposableAliasCall,px \ -; RUN: -r %t.summ0.bc,InterposableAliasWrite1, \ -; RUN: -r %t.summ0.bc,PreemptableAliasCall,px \ -; RUN: -r %t.summ0.bc,PreemptableAliasWrite1, \ -; RUN: -r %t.summ1.bc,AliasToBitcastAliasWrite1,px \ -; RUN: -r %t.summ1.bc,AliasToPreemptableAliasWrite1,px \ -; RUN: -r %t.summ1.bc,AliasWrite1,px \ -; RUN: -r %t.summ1.bc,BitcastAliasWrite1,px \ -; RUN: -r %t.summ1.bc,InterposableAliasWrite1,px \ -; RUN: -r %t.summ1.bc,PreemptableAliasWrite1,px \ -; RUN: -r %t.summ1.bc,Write1,px +; DEFINE: %{res} = \ +; DEFINE: -r %t.summ0.bc,AliasCall,px \ +; DEFINE: -r 
%t.summ0.bc,AliasToBitcastAliasWrite1, \ +; DEFINE: -r %t.summ0.bc,AliasToPreemptableAliasWrite1, \ +; DEFINE: -r %t.summ0.bc,AliasWrite1, \ +; DEFINE: -r %t.summ0.bc,BitcastAliasCall,px \ +; DEFINE: -r %t.summ0.bc,BitcastAliasWrite1, \ +; DEFINE: -r %t.summ0.bc,InterposableAliasCall,px \ +; DEFINE: -r %t.summ0.bc,InterposableAliasWrite1, \ +; DEFINE: -r %t.summ0.bc,PreemptableAliasCall,px \ +; DEFINE: -r %t.summ0.bc,PreemptableAliasWrite1, \ +; DEFINE: -r %t.summ1.bc,AliasToBitcastAliasWrite1,px \ +; DEFINE: -r %t.summ1.bc,AliasToPreemptableAliasWrite1,px \ +; DEFINE: -r %t.summ1.bc,AliasWrite1,px \ +; DEFINE: -r %t.summ1.bc,BitcastAliasWrite1,px \ +; DEFINE: -r %t.summ1.bc,InterposableAliasWrite1,px \ +; DEFINE: -r %t.summ1.bc,PreemptableAliasWrite1,px \ +; DEFINE: -r %t.summ1.bc,Write1,px ; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-print -stack-safety-run -save-temps -thinlto-threads 1 -O0 \ -; RUN: $(cat %t.res.txt) \ +; RUN: %{res} \ ; RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL,LTO target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll b/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll index 8de56adc2bbd1c..0e2a08f3ae8df7 100644 --- a/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll +++ b/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll @@ -1,5 +1,4 @@ ; REQUIRES: aarch64-registered-target -; REQUIRES: shell ; RUN: llvm-as %s -o %t0.bc ; RUN: llvm-as %S/Inputs/ipa.ll -o %t1.bc @@ -17,78 +16,79 @@ ; RUN: llvm-dis %t.summ0.bc -o - > %t.ids.txt ; RUN: llvm-dis %t.summ1.bc -o - >> %t.ids.txt -; RUN: echo > %t.res.txt \ -; RUN: -r %t.summ0.bc,ExternalCall, \ -; RUN: -r %t.summ0.bc,f1,px \ -; RUN: -r %t.summ0.bc,f2,px \ -; RUN: -r %t.summ0.bc,f3,px \ -; RUN: -r %t.summ0.bc,f4,px \ -; RUN: -r %t.summ0.bc,f5,px \ -; RUN: -r %t.summ0.bc,f6,px \ -; RUN: -r %t.summ0.bc,f7,px \ -; RUN: -r %t.summ0.bc,f8left,px \ -; RUN: -r %t.summ0.bc,f8oobleft,px \ -; RUN: -r 
%t.summ0.bc,f8oobright,px \ -; RUN: -r %t.summ0.bc,f8right,px \ -; RUN: -r %t.summ0.bc,InterposableCall,px \ -; RUN: -r %t.summ0.bc,InterposableWrite1, \ -; RUN: -r %t.summ0.bc,PreemptableCall,px \ -; RUN: -r %t.summ0.bc,PreemptableWrite1, \ -; RUN: -r %t.summ0.bc,PrivateCall,px \ -; RUN: -r %t.summ0.bc,Rec2, \ -; RUN: -r %t.summ0.bc,RecursiveNoOffset, \ -; RUN: -r %t.summ0.bc,RecursiveWithOffset, \ -; RUN: -r %t.summ0.bc,ReturnDependent, \ -; RUN: -r %t.summ0.bc,TestCrossModuleConflict,px \ -; RUN: -r %t.summ0.bc,TestCrossModuleOnce,px \ -; RUN: -r %t.summ0.bc,TestCrossModuleTwice,px \ -; RUN: -r %t.summ0.bc,TestCrossModuleWeak,px \ -; RUN: -r %t.summ0.bc,TestRecursiveNoOffset,px \ -; RUN: -r %t.summ0.bc,TestRecursiveWithOffset,px \ -; RUN: -r %t.summ0.bc,TestUpdateArg,px \ -; RUN: -r %t.summ0.bc,TwoArguments,px \ -; RUN: -r %t.summ0.bc,TwoArgumentsOOBBoth,px \ -; RUN: -r %t.summ0.bc,TwoArgumentsOOBOne,px \ -; RUN: -r %t.summ0.bc,TwoArgumentsOOBOther,px \ -; RUN: -r %t.summ0.bc,Weak,x \ -; RUN: -r %t.summ0.bc,Write1, \ -; RUN: -r %t.summ0.bc,Write1DiffModule,x \ -; RUN: -r %t.summ0.bc,Write1Module0,px \ -; RUN: -r %t.summ0.bc,Write1Private,x \ -; RUN: -r %t.summ0.bc,Write1SameModule,x \ -; RUN: -r %t.summ0.bc,Write1Weak,x \ -; RUN: -r %t.summ0.bc,Write4_2, \ -; RUN: -r %t.summ0.bc,Write4, \ -; RUN: -r %t.summ0.bc,Write8, \ -; RUN: -r %t.summ0.bc,WriteAndReturn8, \ -; RUN: -r %t.summ1.bc,ExternalCall,px \ -; RUN: -r %t.summ1.bc,InterposableWrite1,px \ -; RUN: -r %t.summ1.bc,PreemptableWrite1,px \ -; RUN: -r %t.summ1.bc,Rec0,px \ -; RUN: -r %t.summ1.bc,Rec1,px \ -; RUN: -r %t.summ1.bc,Rec2,px \ -; RUN: -r %t.summ1.bc,RecursiveNoOffset,px \ -; RUN: -r %t.summ1.bc,RecursiveWithOffset,px \ -; RUN: -r %t.summ1.bc,ReturnAlloca,px \ -; RUN: -r %t.summ1.bc,ReturnDependent,px \ -; RUN: -r %t.summ1.bc,Weak,x \ -; RUN: -r %t.summ1.bc,Write1,px \ -; RUN: -r %t.summ1.bc,Write1DiffModule,px \ -; RUN: -r %t.summ1.bc,Write1Module0,x \ -; RUN: -r %t.summ1.bc,Write1Private,px \ -; 
RUN: -r %t.summ1.bc,Write1SameModule,px \ -; RUN: -r %t.summ1.bc,Write1Weak,px \ -; RUN: -r %t.summ1.bc,Write4_2,px \ -; RUN: -r %t.summ1.bc,Write4,px \ -; RUN: -r %t.summ1.bc,Write8,px \ -; RUN: -r %t.summ1.bc,WriteAndReturn8,px +; DEFINE: %{res} = \ +; DEFINE: -r %t.summ0.bc,ExternalCall, \ +; DEFINE: -r %t.summ0.bc,f1,px \ +; DEFINE: -r %t.summ0.bc,f2,px \ +; DEFINE: -r %t.summ0.bc,f3,px \ +; DEFINE: -r %t.summ0.bc,f4,px \ +; DEFINE: -r %t.summ0.bc,f5,px \ +; DEFINE: -r %t.summ0.bc,f6,px \ +; DEFINE: -r %t.summ0.bc,f7,px \ +; DEFINE: -r %t.summ0.bc,f8left,px \ +; DEFINE: -r %t.summ0.bc,f8oobleft,px \ +; DEFINE: -r %t.summ0.bc,f8oobright,px \ +; DEFINE: -r %t.summ0.bc,f8right,px \ +; DEFINE: -r %t.summ0.bc,InterposableCall,px \ +; DEFINE: -r %t.summ0.bc,InterposableWrite1, \ +; DEFINE: -r %t.summ0.bc,PreemptableCall,px \ +; DEFINE: -r %t.summ0.bc,PreemptableWrite1, \ +; DEFINE: -r %t.summ0.bc,PrivateCall,px \ +; DEFINE: -r %t.summ0.bc,Rec2, \ +; DEFINE: -r %t.summ0.bc,RecursiveNoOffset, \ +; DEFINE: -r %t.summ0.bc,RecursiveWithOffset, \ +; DEFINE: -r %t.summ0.bc,ReturnDependent, \ +; DEFINE: -r %t.summ0.bc,TestCrossModuleConflict,px \ +; DEFINE: -r %t.summ0.bc,TestCrossModuleOnce,px \ +; DEFINE: -r %t.summ0.bc,TestCrossModuleTwice,px \ +; DEFINE: -r %t.summ0.bc,TestCrossModuleWeak,px \ +; DEFINE: -r %t.summ0.bc,TestRecursiveNoOffset,px \ +; DEFINE: -r %t.summ0.bc,TestRecursiveWithOffset,px \ +; DEFINE: -r %t.summ0.bc,TestUpdateArg,px \ +; DEFINE: -r %t.summ0.bc,TwoArguments,px \ +; DEFINE: -r %t.summ0.bc,TwoArgumentsOOBBoth,px \ +; DEFINE: -r %t.summ0.bc,TwoArgumentsOOBOne,px \ +; DEFINE: -r %t.summ0.bc,TwoArgumentsOOBOther,px \ +; DEFINE: -r %t.summ0.bc,Weak,x \ +; DEFINE: -r %t.summ0.bc,Write1, \ +; DEFINE: -r %t.summ0.bc,Write1DiffModule,x \ +; DEFINE: -r %t.summ0.bc,Write1Module0,px \ +; DEFINE: -r %t.summ0.bc,Write1Private,x \ +; DEFINE: -r %t.summ0.bc,Write1SameModule,x \ +; DEFINE: -r %t.summ0.bc,Write1Weak,x \ +; DEFINE: -r %t.summ0.bc,Write4_2, \ +; 
DEFINE: -r %t.summ0.bc,Write4, \ +; DEFINE: -r %t.summ0.bc,Write8, \ +; DEFINE: -r %t.summ0.bc,WriteAndReturn8, \ +; DEFINE: -r %t.summ1.bc,ExternalCall,px \ +; DEFINE: -r %t.summ1.bc,InterposableWrite1,px \ +; DEFINE: -r %t.summ1.bc,PreemptableWrite1,px \ +; DEFINE: -r %t.summ1.bc,Rec0,px \ +; DEFINE: -r %t.summ1.bc,Rec1,px \ +; DEFINE: -r %t.summ1.bc,Rec2,px \ +; DEFINE: -r %t.summ1.bc,RecursiveNoOffset,px \ +; DEFINE: -r %t.summ1.bc,RecursiveWithOffset,px \ +; DEFINE: -r %t.summ1.bc,ReturnAlloca,px \ +; DEFINE: -r %t.summ1.bc,ReturnDependent,px \ +; DEFINE: -r %t.summ1.bc,Weak,x \ +; DEFINE: -r %t.summ1.bc,Write1,px \ +; DEFINE: -r %t.summ1.bc,Write1DiffModule,px \ +; DEFINE: -r %t.summ1.bc,Write1Module0,x \ +; DEFINE: -r %t.summ1.bc,Write1Private,px \ +; DEFINE: -r %t.summ1.bc,Write1SameModule,px \ +; DEFINE: -r %t.summ1.bc,Write1Weak,px \ +; DEFINE: -r %t.summ1.bc,Write4_2,px \ +; DEFINE: -r %t.summ1.bc,Write4,px \ +; DEFINE: -r %t.summ1.bc,Write8,px \ +; DEFINE: -r %t.summ1.bc,WriteAndReturn8,px ; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-print -stack-safety-run -save-temps -thinlto-threads 1 -O0 \ -; RUN: $(cat %t.res.txt) \ +; RUN: %{res} \ ; RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL,LTO -; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-run -thinlto-distributed-indexes -thinlto-threads 1 -O0 $(cat %t.res.txt) -; RUN: (cat %t.ids.txt ; llvm-dis %t.summ1.bc.thinlto.bc -o -) | FileCheck --check-prefixes=INDEX %s +; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-run -thinlto-distributed-indexes -thinlto-threads 1 -O0 %{res} +; RUN: llvm-dis %t.summ1.bc.thinlto.bc -o - >> %t.ids.txt +; RUN: FileCheck --check-prefixes=INDEX %s < %t.ids.txt target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux" From 529b43c1fd435e1544bbd581fcf7191c3edfa20e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 11 Jun 2024 
06:14:17 +0200 Subject: [PATCH 38/82] [clang][Interp] Refine diagnostics for casts from void* This is still not perfect, but an improvement in general. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 14 ++++++++++---- clang/lib/AST/Interp/Interp.h | 23 +++++++++++++++++++---- clang/lib/AST/Interp/Opcodes.td | 5 ++++- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 6654a27c921689..0899a98b3b95a6 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -318,7 +318,8 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { if (DiscardResult) return this->discard(SubExpr); - std::optional FromT = classify(SubExpr->getType()); + QualType SubExprTy = SubExpr->getType(); + std::optional FromT = classify(SubExprTy); std::optional ToT = classify(CE->getType()); if (!FromT || !ToT) return false; @@ -326,9 +327,14 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { assert(isPtrType(*FromT)); assert(isPtrType(*ToT)); if (FromT == ToT) { - if (SubExpr->getType()->isVoidPointerType()) - return this->visit(SubExpr) && this->emitVoidPtrCast(CE); - return this->delegate(SubExpr); + if (CE->getType()->isVoidPointerType()) + return this->delegate(SubExpr); + + if (!this->visit(SubExpr)) + return false; + if (FromT == PT_Ptr) + return this->emitPtrPtrCast(SubExprTy->isVoidPointerType(), CE); + return true; } if (!this->visit(SubExpr)) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 0ad710c5ec1afc..784e138e1467d4 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1980,10 +1980,25 @@ static inline bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, return true; } -static inline bool VoidPtrCast(InterpState &S, CodePtr OpPC) { - const SourceInfo &E = S.Current->getSource(OpPC); - S.CCEDiag(E, diag::note_constexpr_invalid_cast) - << 2 << 
S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC); +static inline bool PtrPtrCast(InterpState &S, CodePtr OpPC, bool SrcIsVoidPtr) { + const auto &Ptr = S.Stk.peek<Pointer>(); + + if (SrcIsVoidPtr && S.getLangOpts().CPlusPlus) { + bool HasValidResult = !Ptr.isZero(); + + if (HasValidResult) { + // FIXME: note_constexpr_invalid_void_star_cast + } else if (!S.getLangOpts().CPlusPlus26) { + const SourceInfo &E = S.Current->getSource(OpPC); + S.CCEDiag(E, diag::note_constexpr_invalid_cast) + << 3 << "'void *'" << S.Current->getRange(OpPC); + } + } else { + const SourceInfo &E = S.Current->getSource(OpPC); + S.CCEDiag(E, diag::note_constexpr_invalid_cast) + << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC); + } + return true; } diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index ac5426c87c2123..45fc11e5645767 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -665,7 +665,10 @@ def CastPointerIntegralAPS : Opcode { let HasGroup = 0; let Args = [ArgUint32]; } -def VoidPtrCast : Opcode; +def PtrPtrCast : Opcode { + let Args = [ArgBool]; + +} def DecayPtr : Opcode { let Types = [PtrTypeClass, PtrTypeClass]; From ec81c9b1dd856dbe77b042e7d293e437b587d914 Mon Sep 17 00:00:00 2001 From: Pavel Samolysov Date: Tue, 11 Jun 2024 12:24:46 +0300 Subject: [PATCH 39/82] [clang] Remove a redundant check in Mangle. NFC (#95071) This addresses a review comment for PR #94987. Because that PR is a big automatic change, this change was moved into a separate one. 
--- clang/lib/AST/Mangle.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index 4af4d7c00c5cb3..4fbf0e3b42dbc8 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -301,9 +301,8 @@ void MangleContext::mangleBlock(const DeclContext *DC, const BlockDecl *BD, } else { assert((isa<NamedDecl>(DC) || isa<BlockDecl>(DC)) && "expected a NamedDecl or BlockDecl"); - if (isa<BlockDecl>(DC)) - for (; isa_and_nonnull<BlockDecl>(DC); DC = DC->getParent()) - (void) getBlockId(cast<BlockDecl>(DC), true); + for (; isa_and_nonnull<BlockDecl>(DC); DC = DC->getParent()) + (void)getBlockId(cast<BlockDecl>(DC), true); assert((isa<TranslationUnitDecl>(DC) || isa<NamedDecl>(DC)) && "expected a TranslationUnitDecl or a NamedDecl"); if (const auto *CD = dyn_cast<CXXConstructorDecl>(DC)) From 995ba4afcd0d913bca5e082afe90be78b2882f79 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 10 Jun 2024 21:55:42 +0100 Subject: [PATCH 40/82] [CostModel][X86] Adjust ABS scalar SizeLatency cost to 3uops This was previously set to 4uops which was including the cost of extra register moves in the original test code. 
--- .../lib/Target/X86/X86TargetTransformInfo.cpp | 8 +- .../Analysis/CostModel/X86/abs-sizelatency.ll | 120 +++++++++--------- llvm/test/CodeGen/X86/abs.ll | 80 +++++------- 3 files changed, 98 insertions(+), 110 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0a23bf2516763a..74948778ccf853 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4061,7 +4061,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext()) }; static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV + { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV @@ -4082,9 +4082,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto }; static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets - { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV - { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV - { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA + { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV + { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV + { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } }, { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } }, { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } }, diff --git a/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll index 4498d499745aa8..5d41228fdce186 100644 --- a/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll @@ 
-14,42 +14,42 @@ define void @cost_abs_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { ; SSE-LABEL: 'cost_abs_i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) ; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) ; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_abs_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_abs_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call 
<2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i64' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'cost_abs_i64' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; 
; AVX512BW-LABEL: 'cost_abs_i64' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false) @@ -64,56 +64,56 @@ define void @cost_abs_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> define void @cost_abs_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { ; SSE2-LABEL: 'cost_abs_i32' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'cost_abs_i32' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_abs_i32' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_abs_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
void ; ; AVX2-LABEL: 'cost_abs_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i32' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'cost_abs_i32' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) 
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'cost_abs_i32' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false) @@ -128,56 +128,56 @@ define void @cost_abs_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> define void @cost_abs_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { ; SSE2-LABEL: 'cost_abs_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) ; SSE2-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'cost_abs_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_abs_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_abs_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> 
@llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_abs_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i16' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; 
AVX512DQ-LABEL: 'cost_abs_i16' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'cost_abs_i16' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false) @@ -192,56 +192,56 @@ define void @cost_abs_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16 define void @cost_abs_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { ; SSE2-LABEL: 'cost_abs_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; SSE2-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'cost_abs_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_abs_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void 
; ; AVX1-LABEL: 'cost_abs_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_abs_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i8' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'cost_abs_i8' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'cost_abs_i8' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false) @@ -260,42 +260,42 @@ define void @cost_abs_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a5 define void @cost_abs_i64_poison(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) { ; 
SSE-LABEL: 'cost_abs_i64_poison' -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) ; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) ; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_abs_i64_poison' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_abs_i64_poison' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated 
cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i64_poison' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'cost_abs_i64_poison' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'cost_abs_i64_poison' -; AVX512BW-NEXT: Cost Model: Found an estimated cost 
of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true) @@ -310,56 +310,56 @@ define void @cost_abs_i64_poison(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 define void @cost_abs_i32_poison(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) { ; SSE2-LABEL: 'cost_abs_i32_poison' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'cost_abs_i32_poison' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> 
%a128, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_abs_i32_poison' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_abs_i32_poison' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_abs_i32_poison' -; AVX2-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i32_poison' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'cost_abs_i32_poison' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'cost_abs_i32_poison' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true) @@ -374,56 +374,56 @@ define void @cost_abs_i32_poison(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 define void @cost_abs_i16_poison(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) { ; SSE2-LABEL: 'cost_abs_i16_poison' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
void ; ; SSSE3-LABEL: 'cost_abs_i16_poison' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_abs_i16_poison' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 'cost_abs_i16_poison' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; 
AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_abs_i16_poison' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i16_poison' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'cost_abs_i16_poison' -; AVX512DQ-NEXT: 
Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'cost_abs_i16_poison' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true) @@ -438,56 +438,56 @@ define void @cost_abs_i16_poison(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 define void @cost_abs_i8_poison(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) { ; SSE2-LABEL: 'cost_abs_i8_poison' -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSSE3-LABEL: 'cost_abs_i8_poison' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SSE42-LABEL: 'cost_abs_i8_poison' -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX1-LABEL: 
'cost_abs_i8_poison' -; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX2-LABEL: 'cost_abs_i8_poison' -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512F-LABEL: 'cost_abs_i8_poison' -; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'cost_abs_i8_poison' -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512BW-LABEL: 'cost_abs_i8_poison' -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true) diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index 7642c0c9541429..dde877c5bb61e5 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -713,17 +713,15 @@ define 
i128 @test_sextinreg_i128(i128 %a) nounwind { define i8 @test_minsigned_i8(i8 %a0, i8 %a1) nounwind { ; X64-LABEL: test_minsigned_i8: ; X64: # %bb.0: -; X64-NEXT: cmpb $-128, %dil -; X64-NEXT: jne .LBB17_1 -; X64-NEXT: # %bb.2: # %select.end -; X64-NEXT: movl %esi, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB17_1: # %select.false.sink ; X64-NEXT: movl %edi, %eax ; X64-NEXT: sarb $7, %al -; X64-NEXT: xorb %al, %dil -; X64-NEXT: subb %al, %dil -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: xorb %al, %cl +; X64-NEXT: subb %al, %cl +; X64-NEXT: cmpb $-128, %dil +; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86-LABEL: test_minsigned_i8: @@ -731,14 +729,17 @@ define i8 @test_minsigned_i8(i8 %a0, i8 %a1) nounwind { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpb $-128, %al ; X86-NEXT: jne .LBB17_1 -; X86-NEXT: # %bb.2: # %select.end -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # %bb.2: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl -; X86-NEXT: .LBB17_1: # %select.false.sink +; X86-NEXT: .LBB17_1: ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sarb $7, %cl ; X86-NEXT: xorb %cl, %al ; X86-NEXT: subb %cl, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl %lim = icmp eq i8 %a0, -128 %abs = tail call i8 @llvm.abs.i8(i8 %a0, i1 false) @@ -749,30 +750,26 @@ define i8 @test_minsigned_i8(i8 %a0, i8 %a1) nounwind { define i16 @test_minsigned_i16(i16 %a0, i16 %a1) nounwind { ; X64-LABEL: test_minsigned_i16: ; X64: # %bb.0: -; X64-NEXT: movzwl %di, %eax -; X64-NEXT: cmpl $32768, %eax # imm = 0x8000 -; X64-NEXT: jne .LBB18_1 -; X64-NEXT: # %bb.2: # %select.end -; X64-NEXT: movl %esi, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB18_1: # %select.false.sink -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: 
negw %ax -; X64-NEXT: cmovsw %di, %ax +; X64-NEXT: cmovsw %cx, %ax +; X64-NEXT: cmpl $32768, %ecx # imm = 0x8000 +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: test_minsigned_i16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $32768, %ecx # imm = 0x8000 -; X86-NEXT: jne .LBB18_1 -; X86-NEXT: # %bb.2: # %select.end -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: retl -; X86-NEXT: .LBB18_1: # %select.false.sink ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negw %ax ; X86-NEXT: cmovsw %cx, %ax +; X86-NEXT: cmpl $32768, %ecx # imm = 0x8000 +; X86-NEXT: jne .LBB18_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: .LBB18_2: ; X86-NEXT: retl %lim = icmp eq i16 %a0, -32768 %abs = tail call i16 @llvm.abs.i16(i16 %a0, i1 false) @@ -783,29 +780,24 @@ define i16 @test_minsigned_i16(i16 %a0, i16 %a1) nounwind { define i32 @test_minsigned_i32(i32 %a0, i32 %a1) nounwind { ; X64-LABEL: test_minsigned_i32: ; X64: # %bb.0: -; X64-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000 -; X64-NEXT: jne .LBB19_1 -; X64-NEXT: # %bb.2: # %select.end -; X64-NEXT: movl %esi, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB19_1: # %select.false.sink ; X64-NEXT: movl %edi, %eax ; X64-NEXT: negl %eax ; X64-NEXT: cmovsl %edi, %eax +; X64-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000 +; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq ; ; X86-LABEL: test_minsigned_i32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl $-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: jne .LBB19_1 -; X86-NEXT: # %bb.2: # %select.end -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: retl -; X86-NEXT: .LBB19_1: # %select.false.sink ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax ; X86-NEXT: cmovsl %ecx, %eax +; X86-NEXT: cmpl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: jne .LBB19_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: 
.LBB19_2: ; X86-NEXT: retl %lim = icmp eq i32 %a0, -2147483648 %abs = tail call i32 @llvm.abs.i32(i32 %a0, i1 false) @@ -816,16 +808,12 @@ define i32 @test_minsigned_i32(i32 %a0, i32 %a1) nounwind { define i64 @test_minsigned_i64(i64 %a0, i64 %a1) nounwind { ; X64-LABEL: test_minsigned_i64: ; X64: # %bb.0: -; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; X64-NEXT: cmpq %rax, %rdi -; X64-NEXT: jne .LBB20_1 -; X64-NEXT: # %bb.2: # %select.end -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: retq -; X64-NEXT: .LBB20_1: # %select.false.sink ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: negq %rax ; X64-NEXT: cmovsq %rdi, %rax +; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; X64-NEXT: cmpq %rcx, %rdi +; X64-NEXT: cmoveq %rsi, %rax ; X64-NEXT: retq ; ; X86-LABEL: test_minsigned_i64: From e5bdb7af86c2947ab138049f0aafd7de9381b944 Mon Sep 17 00:00:00 2001 From: c8ef Date: Tue, 11 Jun 2024 17:42:19 +0800 Subject: [PATCH 41/82] [InstCombine] fold ldexp(x, sext(i1 y)) to fmul x, (select y, 0.5, 1.0) (#95073) Follow up of #94887. 
Context: https://github.com/llvm/llvm-project/pull/94887#pullrequestreview-2106213891 --- .../InstCombine/InstCombineCalls.cpp | 8 +++ .../{ldexp-zext.ll => ldexp-ext.ll} | 55 +++++++++++++++++++ 2 files changed, 63 insertions(+) rename llvm/test/Transforms/InstCombine/{ldexp-zext.ll => ldexp-ext.ll} (51%) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 4346a07e3a2cb2..436cdbff756699 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2619,6 +2619,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } // ldexp(x, zext(i1 y)) -> fmul x, (select y, 2.0, 1.0) + // ldexp(x, sext(i1 y)) -> fmul x, (select y, 0.5, 1.0) Value *ExtSrc; if (match(Exp, m_ZExt(m_Value(ExtSrc))) && ExtSrc->getType()->getScalarSizeInBits() == 1) { @@ -2627,6 +2628,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { ConstantFP::get(II->getType(), 1.0)); return BinaryOperator::CreateFMulFMF(Src, Select, II); } + if (match(Exp, m_SExt(m_Value(ExtSrc))) && + ExtSrc->getType()->getScalarSizeInBits() == 1) { + Value *Select = + Builder.CreateSelect(ExtSrc, ConstantFP::get(II->getType(), 0.5), + ConstantFP::get(II->getType(), 1.0)); + return BinaryOperator::CreateFMulFMF(Src, Select, II); + } break; } diff --git a/llvm/test/Transforms/InstCombine/ldexp-zext.ll b/llvm/test/Transforms/InstCombine/ldexp-ext.ll similarity index 51% rename from llvm/test/Transforms/InstCombine/ldexp-zext.ll rename to llvm/test/Transforms/InstCombine/ldexp-ext.ll index b6e4f124940595..4608553eb88743 100644 --- a/llvm/test/Transforms/InstCombine/ldexp-zext.ll +++ b/llvm/test/Transforms/InstCombine/ldexp-ext.ll @@ -55,3 +55,58 @@ define <2 x float> @ldexp_zext_float_vector(<2 x float> %x, <2 x i1> %bool) { %ldexp = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> %zext) ret <2 x float> %ldexp } + +define float 
@ldexp_sext_float(float %x, i1 %bool) { +; CHECK-LABEL: @ldexp_sext_float( +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], float 5.000000e-01, float 1.000000e+00 +; CHECK-NEXT: [[LDEXP:%.*]] = fmul float [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret float [[LDEXP]] +; + %sext = sext i1 %bool to i32 + %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %sext) + ret float %ldexp +} + +define float @ldexp_sext_float_negative(float %x, i8 %y) { +; CHECK-LABEL: @ldexp_sext_float_negative( +; CHECK-NEXT: [[SEXT:%.*]] = sext i8 [[Y:%.*]] to i32 +; CHECK-NEXT: [[LDEXP:%.*]] = call float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[SEXT]]) +; CHECK-NEXT: ret float [[LDEXP]] +; + %sext = sext i8 %y to i32 + %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %sext) + ret float %ldexp +} + +define double @ldexp_sext_double(double %x, i1 %bool) { +; CHECK-LABEL: @ldexp_sext_double( +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 5.000000e-01, double 1.000000e+00 +; CHECK-NEXT: [[LDEXP:%.*]] = fmul double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret double [[LDEXP]] +; + %sext = sext i1 %bool to i32 + %ldexp = call double @llvm.ldexp.f64.i32(double %x, i32 %sext) + ret double %ldexp +} + +define double @ldexp_sext_double_fast_math(double %x, i1 %bool) { +; CHECK-LABEL: @ldexp_sext_double_fast_math( +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 5.000000e-01, double 1.000000e+00 +; CHECK-NEXT: [[LDEXP:%.*]] = fmul reassoc double [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret double [[LDEXP]] +; + %sext = sext i1 %bool to i32 + %ldexp = call reassoc double @llvm.ldexp.f64.i32(double %x, i32 %sext) + ret double %ldexp +} + +define <2 x float> @ldexp_sext_float_vector(<2 x float> %x, <2 x i1> %bool) { +; CHECK-LABEL: @ldexp_sext_float_vector( +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[BOOL:%.*]], <2 x float> <float 5.000000e-01, float 5.000000e-01>, <2 x float> <float 1.000000e+00, float 1.000000e+00> +; CHECK-NEXT: [[LDEXP:%.*]] = fmul <2 x float> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret <2 x float> [[LDEXP]] +; + %sext = sext <2 x i1> %bool
to <2 x i32> + %ldexp = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> %sext) + ret <2 x float> %ldexp +} From 9dfd0760cb8b4f7fdb32008ade70c605e844051e Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 11 Jun 2024 10:55:25 +0100 Subject: [PATCH 42/82] [test] Fix documentation of %{fs-sep} et al (#95088) --- llvm/docs/TestingGuide.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst index 89499d8b937d5c..c35e58bc53b671 100644 --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -741,16 +741,16 @@ RUN lines: Expands to the path separator, i.e. ``:`` (or ``;`` on Windows). -``${fs-src-root}`` +``%{fs-src-root}`` Expands to the root component of file system paths for the source directory, i.e. ``/`` on Unix systems or ``C:\`` (or another drive) on Windows. -``${fs-tmp-root}`` +``%{fs-tmp-root}`` Expands to the root component of file system paths for the test's temporary directory, i.e. ``/`` on Unix systems or ``C:\`` (or another drive) on Windows. -``${fs-sep}`` +``%{fs-sep}`` Expands to the file system separator, i.e. ``/`` or ``\`` on Windows. 
``%/s, %/S, %/t, %/T`` From 2ca8c856eeae739ec1e7242ee7e69f99ecf376d3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 11 Jun 2024 11:23:00 +0100 Subject: [PATCH 43/82] [X86] is_fpclass.ll - add NDD test coverage --- llvm/test/CodeGen/X86/is_fpclass.ll | 4309 ++++++++++++++------------- 1 file changed, 2297 insertions(+), 2012 deletions(-) diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll index 2046d790cc57e4..999be0f98b6fc5 100644 --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -1,1336 +1,1414 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefix=CHECK-32 -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK-64 +; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-GENERIC +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+ndd | FileCheck %s -check-prefixes=X64,X64-NDD define i1 @isnan_f(float %x) { -; CHECK-32-LABEL: isnan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-64-NEXT: setp %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %al +; X86-NEXT: retl +; +; X64-LABEL: isnan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: setp %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; "nan" ret i1 %0 } define 
i1 @isnot_nan_f(float %x) { -; CHECK-32-LABEL: isnot_nan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setnp %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnot_nan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-64-NEXT: setnp %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnot_nan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setnp %al +; X86-NEXT: retl +; +; X64-LABEL: isnot_nan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: setnp %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1020) ; 0x3fc = "zero|subnormal|normal|inf" ret i1 %0 } define i1 @issignaling_f(float %x) { -; CHECK-32-LABEL: issignaling_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %cl -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: andb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issignaling_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %cl -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: andb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: issignaling_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 
0x7FC00000 +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: issignaling_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setl %cl +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" ret i1 %0 } define i1 @not_issignaling_f(float %x) { -; CHECK-32-LABEL: not_issignaling_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issignaling_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issignaling_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %cl +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setl %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_issignaling_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge 
%cl +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setl %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1022) ; ~"snan" ret i1 %0 } define i1 @isquiet_f(float %x) { -; CHECK-32-LABEL: isquiet_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isquiet_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: retq +; X86-LABEL: isquiet_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: retl +; +; X64-LABEL: isquiet_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" ret i1 %0 } define i1 @not_isquiet_f(float %x) { -; CHECK-32-LABEL: not_isquiet_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_isquiet_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isquiet_f: +; X86: # %bb.0: # 
%entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %al +; X86-NEXT: retl +; +; X64-LABEL: not_isquiet_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setl %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" ret i1 %0 } define i1 @isinf_f(float %x) { -; CHECK-32-LABEL: isinf_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isinf_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: isinf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" ret i1 %0 } define i1 @not_isinf_f(float %x) { -; CHECK-32-LABEL: not_isinf_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: 
not_isinf_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isinf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_isinf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" ret i1 %0 } define i1 @is_plus_inf_f(float %x) { -; CHECK-32-LABEL: is_plus_inf_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_inf_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: is_plus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" ret i1 %0 } define i1 @is_minus_inf_f(float %x) { -; CHECK-32-LABEL: is_minus_inf_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_minus_inf_f: -; CHECK-64: # %bb.0: # %entry -; 
CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: is_minus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" ret i1 %0 } define i1 @not_is_minus_inf_f(float %x) { -; CHECK-32-LABEL: not_is_minus_inf_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_minus_inf_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_minus_inf_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_minus_inf_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" ret i1 %0 } define i1 @isfinite_f(float %x) { -; CHECK-32-LABEL: isfinite_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isfinite_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; 
CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: retq +; X86-LABEL: isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: retl +; +; X64-LABEL: isfinite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" ret i1 %0 } define i1 @not_isfinite_f(float %x) { -; CHECK-32-LABEL: not_isfinite_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_isfinite_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isfinite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setge %al +; X86-NEXT: retl +; +; X64-LABEL: not_isfinite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setge %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" ret i1 %0 } define i1 @is_plus_finite_f(float %x) { -; CHECK-32-LABEL: is_plus_finite_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; 
CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_finite_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_finite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: is_plus_finite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setb %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" ret i1 %0 } define i1 @not_is_plus_finite_f(float %x) { -; CHECK-32-LABEL: not_is_plus_finite_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; CHECK-32-NEXT: setae %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_plus_finite_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setae %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_plus_finite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: setae %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_plus_finite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setae %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 575) ; ~0x1c0 = ~"+finite" ret i1 %0 } define i1 @is_minus_finite_f(float %x) { -; CHECK-32-LABEL: is_minus_finite_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: sets %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; 
CHECK-32-NEXT: andb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_minus_finite_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sets %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: andb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_minus_finite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sets %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: is_minus_finite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sets %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 56) ; 0x38 = "-finite" ret i1 %0 } define i1 @not_is_minus_finite_f(float %x) { -; CHECK-32-LABEL: not_is_minus_finite_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: setns %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_minus_finite_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: setns %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: 
not_is_minus_finite_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setns %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setge %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_minus_finite_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setns %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setge %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 967) ; ~0x38 = ~"-finite" ret i1 %0 } define i1 @isnormal_f(float %x) #1 { -; CHECK-32-LABEL: isnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnormal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: isnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; 
X64-NEXT: setb %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 264) ; 0x108 = "normal" ret i1 %0 } define i1 @not_isnormal_f(float %x) #1 { -; CHECK-32-LABEL: not_isnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-32-NEXT: setae %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_isnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setae %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isnormal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setae %al +; X86-NEXT: retl +; +; X64-LABEL: not_isnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-NEXT: setae %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 759) ; ~0x108 = "~normal" ret i1 %0 } define i1 @is_plus_normal_f(float %x) { -; CHECK-32-LABEL: is_plus_normal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: setns %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: andb %cl, %al -; 
CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_normal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: setns %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: andb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_normal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setns %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setb %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: is_plus_normal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setns %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-NEXT: setb %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 256) ; 0x100 = "+normal" ret i1 %0 } define i1 @issubnormal_f(float %x) { -; CHECK-32-LABEL: issubnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_f: +; X86: # %bb.0: # %entry +; 
X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setb %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 144) ; 0x90 = "subnormal" ret i1 %0 } define i1 @issubnormal_f_daz(float %x) #0 { -; CHECK-32-LABEL: issubnormal_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setb %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 144) ; 0x90 = "subnormal" ret i1 %0 } define i1 @issubnormal_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: issubnormal_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, 
%eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setb %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 144) ; 0x90 = "subnormal" ret i1 %0 } define i1 @not_issubnormal_f(float %x) { -; CHECK-32-LABEL: not_issubnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setae %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setae %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl 
$8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setae %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setae %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 879) ; ~0x90 = "~subnormal" ret i1 %0 } define i1 @not_issubnormal_f_daz(float %x) #0 { -; CHECK-32-LABEL: not_issubnormal_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setae %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setae %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setae %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setae %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 879) ; ~0x90 = "~subnormal" ret i1 %0 } define i1 @not_issubnormal_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: not_issubnormal_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl 
%eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setae %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setae %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setae %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setae %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 879) ; ~0x90 = "~subnormal" ret i1 %0 } define i1 @is_plus_subnormal_f(float %x) { -; CHECK-32-LABEL: is_plus_subnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_subnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_subnormal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: is_plus_subnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; 
X64-NEXT: setb %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 128) ; 0x80 = "+subnormal" ret i1 %0 } define i1 @not_is_plus_subnormal_f(float %x) { -; CHECK-32-LABEL: not_is_plus_subnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setae %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_plus_subnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setae %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_plus_subnormal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setae %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_plus_subnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setae %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 895) ; ~0x80 = ~"+subnormal" ret i1 %0 } define i1 @is_minus_subnormal_f(float %x) { -; CHECK-32-LABEL: is_minus_subnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: sets %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: andb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_minus_subnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sets %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: 
setb %al -; CHECK-64-NEXT: andb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_minus_subnormal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sets %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setb %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: is_minus_subnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sets %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setb %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 16) ; 0x10 = "-subnormal" ret i1 %0 } define i1 @not_is_minus_subnormal_f(float %x) { -; CHECK-32-LABEL: not_is_minus_subnormal_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: setns %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setae %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_minus_subnormal_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: setns %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setae %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_minus_subnormal_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setns %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 
0x7FFFFF +; X86-NEXT: setae %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_minus_subnormal_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setns %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setae %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1007) ; ~0x10 = ~"-subnormal" ret i1 %0 } define i1 @iszero_f(float %x) { -; CHECK-32-LABEL: iszero_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96) ; 0x60 = "zero" ret i1 %0 } define i1 @iszero_f_daz(float %x) #0 { -; CHECK-32-LABEL: iszero_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: 
iszero_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96) ; 0x60 = "zero" ret i1 %0 } define i1 @iszero_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: iszero_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96) ; 0x60 = "zero" ret i1 %0 } define i1 @not_iszero_f(float %x) { -; CHECK-32-LABEL: not_iszero_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_iszero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 
@llvm.is.fpclass.f32(float %x, i32 927) ; ~0x60 = "~zero" ret i1 %0 } define i1 @not_iszero_f_daz(float %x) #0 { -; CHECK-32-LABEL: not_iszero_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_iszero_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 927) ; ~0x60 = "~zero" ret i1 %0 } define i1 @not_iszero_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: not_iszero_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_iszero_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 927) ; ~0x60 = "~zero" ret i1 %0 } define i1 @issubnormal_or_zero_f(float %x) { -; CHECK-32-LABEL: issubnormal_or_zero_f: -; 
CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_zero_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_zero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_or_zero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 240) ; 0xf0 = "subnormal|zero" ret i1 %0 } define i1 @issubnormal_or_zero_f_daz(float %x) #0 { -; CHECK-32-LABEL: issubnormal_or_zero_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_zero_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_zero_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_or_zero_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 240) ; 0xf0 = "subnormal|zero" ret i1 %0 } define i1 @issubnormal_or_zero_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: issubnormal_or_zero_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 
0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_zero_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_zero_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_or_zero_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 240) ; 0xf0 = "subnormal|zero" ret i1 %0 } define i1 @not_issubnormal_or_zero_f(float %x) { -; CHECK-32-LABEL: not_issubnormal_or_zero_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_or_zero_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_zero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_or_zero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 783) ; ~0xf0 = "~(subnormal|zero)" ret i1 %0 } define i1 @not_issubnormal_or_zero_f_daz(float %x) #0 { -; CHECK-32-LABEL: not_issubnormal_or_zero_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; 
-; CHECK-64-LABEL: not_issubnormal_or_zero_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_zero_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_or_zero_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 783) ; ~0xf0 = "~(subnormal|zero)" ret i1 %0 } define i1 @not_issubnormal_or_zero_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: not_issubnormal_or_zero_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_or_zero_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_zero_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_or_zero_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 783) ; ~0xf0 = "~(subnormal|zero)" ret i1 %0 } define i1 @is_plus_zero_f(float %x) { -; CHECK-32-LABEL: is_plus_zero_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_zero_f: -; CHECK-64: # %bb.0: # %entry -; 
CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_zero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: is_plus_zero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 64) ; 0x40 = "+zero" ret i1 %0 } define i1 @not_is_plus_zero_f(float %x) { -; CHECK-32-LABEL: not_is_plus_zero_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_plus_zero_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_plus_zero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_plus_zero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 959) ; ~0x40 = ~"+zero" ret i1 %0 } define i1 @is_minus_zero_f(float %x) { -; CHECK-32-LABEL: is_minus_zero_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_minus_zero_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_minus_zero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: is_minus_zero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, 
%eax +; X64-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 32) ; 0x20 = "-zero" ret i1 %0 } define i1 @not_is_minus_zero_f(float %x) { -; CHECK-32-LABEL: not_is_minus_zero_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000 -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_minus_zero_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_minus_zero_f: +; X86: # %bb.0: # %entry +; X86-NEXT: cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_minus_zero_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-2147483648, %eax # imm = 0x80000000 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 991) ; ~0x20 = ~"-zero" ret i1 %0 } define i1 @isnan_f_strictfp(float %x) strictfp { -; CHECK-32-LABEL: isnan_f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: retl +; +; X64-LABEL: 
isnan_f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; "nan" ret i1 %0 } define i1 @not_isnan_f_strictfp(float %x) strictfp { -; CHECK-32-LABEL: not_isnan_f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_isnan_f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isnan_f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setl %al +; X86-NEXT: retl +; +; X64-LABEL: not_isnan_f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setl %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1020) strictfp ; ~"nan" ret i1 %0 } define i1 @isfinite_f_strictfp(float %x) strictfp { -; CHECK-32-LABEL: isfinite_f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isfinite_f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: 
andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: retq +; X86-LABEL: isfinite_f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: retl +; +; X64-LABEL: isfinite_f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) strictfp ; 0x1f8 = "finite" ret i1 %0 } define i1 @not_isfinite_f_strictfp(float %x) strictfp { -; CHECK-32-LABEL: not_isfinite_f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_isfinite_f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isfinite_f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setge %al +; X86-NEXT: retl +; +; X64-LABEL: not_isfinite_f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setge %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) strictfp ; ~0x1f8 = ~"finite" ret i1 %0 } 
define i1 @iszero_f_strictfp(float %x) strictfp { -; CHECK-32-LABEL: iszero_f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96) strictfp ; 0x60 = "zero" ret i1 %0 } define i1 @not_iszero_f_strictfp(float %x) strictfp { -; CHECK-32-LABEL: not_iszero_f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_iszero_f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 927) strictfp ; ~0x60 = ~"zero" ret i1 %0 } define i1 @isnan_d(double %x) { -; CHECK-32-LABEL: isnan_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: fldl {{[0-9]+}}(%esp) -; 
CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: ucomisd %xmm0, %xmm0 -; CHECK-64-NEXT: setp %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_d: +; X86: # %bb.0: # %entry +; X86-NEXT: fldl {{[0-9]+}}(%esp) +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %al +; X86-NEXT: retl +; +; X64-LABEL: isnan_d: +; X64: # %bb.0: # %entry +; X64-NEXT: ucomisd %xmm0, %xmm0 +; X64-NEXT: setp %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; "nan" ret i1 %0 } define i1 @isinf_d(double %x) { -; CHECK-32-LABEL: isinf_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: xorl $2146435072, %eax # imm = 0x7FF00000 -; CHECK-32-NEXT: orl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isinf_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-64-NEXT: andq %rax, %rcx -; CHECK-64-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 -; CHECK-64-NEXT: cmpq %rax, %rcx -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: isinf_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl $2146435072, %eax # imm = 0x7FF00000 +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: isinf_d: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movq %xmm0, %rax +; X64-GENERIC-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; 
X64-GENERIC-NEXT: andq %rax, %rcx +; X64-GENERIC-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 +; X64-GENERIC-NEXT: cmpq %rax, %rcx +; X64-GENERIC-NEXT: sete %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: isinf_d: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movq %xmm0, %rax +; X64-NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NDD-NEXT: andq %rcx, %rax +; X64-NDD-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NDD-NEXT: cmpq %rcx, %rax +; X64-NDD-NEXT: sete %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 516) ; 0x204 = "inf" ret i1 %0 } define i1 @isfinite_d(double %x) { -; CHECK-32-LABEL: isfinite_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2146435072, %eax # imm = 0x7FF00000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isfinite_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-64-NEXT: andq %rax, %rcx -; CHECK-64-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 -; CHECK-64-NEXT: cmpq %rax, %rcx -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: retq +; X86-LABEL: isfinite_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2146435072, %eax # imm = 0x7FF00000 +; X86-NEXT: setl %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: isfinite_d: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movq %xmm0, %rax +; X64-GENERIC-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-GENERIC-NEXT: andq %rax, %rcx +; X64-GENERIC-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 +; X64-GENERIC-NEXT: cmpq %rax, %rcx +; X64-GENERIC-NEXT: setl %al +; 
X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: isfinite_d: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movq %xmm0, %rax +; X64-NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NDD-NEXT: andq %rcx, %rax +; X64-NDD-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NDD-NEXT: cmpq %rcx, %rax +; X64-NDD-NEXT: setl %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 504) ; 0x1f8 = "finite" ret i1 %0 } define i1 @isnormal_d(double %x) { -; CHECK-32-LABEL: isnormal_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: addl $-1048576, %eax # imm = 0xFFF00000 -; CHECK-32-NEXT: shrl $21, %eax -; CHECK-32-NEXT: cmpl $1023, %eax # imm = 0x3FF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnormal_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-64-NEXT: andq %rax, %rcx -; CHECK-64-NEXT: movabsq $-4503599627370496, %rax # imm = 0xFFF0000000000000 -; CHECK-64-NEXT: addq %rcx, %rax -; CHECK-64-NEXT: shrq $53, %rax -; CHECK-64-NEXT: cmpl $1023, %eax # imm = 0x3FF -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnormal_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $-1048576, %eax # imm = 0xFFF00000 +; X86-NEXT: shrl $21, %eax +; X86-NEXT: cmpl $1023, %eax # imm = 0x3FF +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: isnormal_d: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movq %xmm0, %rax +; X64-GENERIC-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-GENERIC-NEXT: andq %rax, %rcx +; X64-GENERIC-NEXT: movabsq $-4503599627370496, %rax # imm = 0xFFF0000000000000 +; X64-GENERIC-NEXT: addq %rcx, %rax 
+; X64-GENERIC-NEXT: shrq $53, %rax +; X64-GENERIC-NEXT: cmpl $1023, %eax # imm = 0x3FF +; X64-GENERIC-NEXT: setb %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: isnormal_d: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movq %xmm0, %rax +; X64-NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NDD-NEXT: andq %rcx, %rax +; X64-NDD-NEXT: movabsq $-4503599627370496, %rcx # imm = 0xFFF0000000000000 +; X64-NDD-NEXT: addq %rcx, %rax +; X64-NDD-NEXT: shrq $53, %rax +; X64-NDD-NEXT: cmpl $1023, %eax # imm = 0x3FF +; X64-NDD-NEXT: setb %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 264) ; 0x108 = "normal" ret i1 %0 } define i1 @issubnormal_d(double %x) { -; CHECK-32-LABEL: issubnormal_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; CHECK-32-NEXT: addl $-1, %eax -; CHECK-32-NEXT: adcl $-1, %ecx -; CHECK-32-NEXT: cmpl $-1, %eax -; CHECK-32-NEXT: sbbl $1048575, %ecx # imm = 0xFFFFF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-64-NEXT: andq %rax, %rcx -; CHECK-64-NEXT: decq %rcx -; CHECK-64-NEXT: movabsq $4503599627370495, %rax # imm = 0xFFFFFFFFFFFFF -; CHECK-64-NEXT: cmpq %rax, %rcx -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl $-1, %eax +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: sbbl $1048575, %ecx # imm = 0xFFFFF +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: issubnormal_d: +; X64-GENERIC: # %bb.0: # %entry +; 
X64-GENERIC-NEXT: movq %xmm0, %rax +; X64-GENERIC-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-GENERIC-NEXT: andq %rax, %rcx +; X64-GENERIC-NEXT: decq %rcx +; X64-GENERIC-NEXT: movabsq $4503599627370495, %rax # imm = 0xFFFFFFFFFFFFF +; X64-GENERIC-NEXT: cmpq %rax, %rcx +; X64-GENERIC-NEXT: setb %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: issubnormal_d: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movq %xmm0, %rax +; X64-NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NDD-NEXT: andq %rcx, %rax +; X64-NDD-NEXT: decq %rax +; X64-NDD-NEXT: movabsq $4503599627370495, %rcx # imm = 0xFFFFFFFFFFFFF +; X64-NDD-NEXT: cmpq %rcx, %rax +; X64-NDD-NEXT: setb %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 144) ; 0x90 = "subnormal" ret i1 %0 } define i1 @iszero_d(double %x) { -; CHECK-32-LABEL: iszero_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: orl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: shlq %rax -; CHECK-64-NEXT: testq %rax, %rax -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_d: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: shlq %rax +; X64-NEXT: testq %rax, %rax +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 96) ; 0x60 = "zero" ret i1 %0 } define i1 @issignaling_d(double %x) { -; CHECK-32-LABEL: issignaling_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 
0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: xorl %ecx, %ecx -; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; CHECK-32-NEXT: movl $2146435072, %ecx # imm = 0x7FF00000 -; CHECK-32-NEXT: sbbl %eax, %ecx -; CHECK-32-NEXT: setl %cl -; CHECK-32-NEXT: cmpl $2146959360, %eax # imm = 0x7FF80000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: andb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issignaling_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-64-NEXT: andq %rax, %rcx -; CHECK-64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 -; CHECK-64-NEXT: cmpq %rax, %rcx -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 -; CHECK-64-NEXT: cmpq %rax, %rcx -; CHECK-64-NEXT: setg %al -; CHECK-64-NEXT: andb %dl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: issignaling_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $2146435072, %ecx # imm = 0x7FF00000 +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2146959360, %eax # imm = 0x7FF80000 +; X86-NEXT: setl %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: issignaling_d: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movq %xmm0, %rax +; X64-GENERIC-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-GENERIC-NEXT: andq %rax, %rcx +; X64-GENERIC-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; X64-GENERIC-NEXT: cmpq %rax, %rcx +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 +; X64-GENERIC-NEXT: cmpq %rax, %rcx +; X64-GENERIC-NEXT: setg %al +; X64-GENERIC-NEXT: andb %dl, %al +; X64-GENERIC-NEXT: retq +; 
+; X64-NDD-LABEL: issignaling_d: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movq %xmm0, %rax +; X64-NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NDD-NEXT: andq %rcx, %rax +; X64-NDD-NEXT: movabsq $9221120237041090560, %rcx # imm = 0x7FF8000000000000 +; X64-NDD-NEXT: cmpq %rcx, %rax +; X64-NDD-NEXT: setl %cl +; X64-NDD-NEXT: movabsq $9218868437227405312, %rdx # imm = 0x7FF0000000000000 +; X64-NDD-NEXT: cmpq %rdx, %rax +; X64-NDD-NEXT: setg %al +; X64-NDD-NEXT: andb %cl, %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1) ; "snan" ret i1 %0 } define i1 @isquiet_d(double %x) { -; CHECK-32-LABEL: isquiet_d: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2146959360, %eax # imm = 0x7FF80000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isquiet_d: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-64-NEXT: andq %rax, %rcx -; CHECK-64-NEXT: movabsq $9221120237041090559, %rax # imm = 0x7FF7FFFFFFFFFFFF -; CHECK-64-NEXT: cmpq %rax, %rcx -; CHECK-64-NEXT: setg %al -; CHECK-64-NEXT: retq +; X86-LABEL: isquiet_d: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2146959360, %eax # imm = 0x7FF80000 +; X86-NEXT: setge %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: isquiet_d: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movq %xmm0, %rax +; X64-GENERIC-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-GENERIC-NEXT: andq %rax, %rcx +; X64-GENERIC-NEXT: movabsq $9221120237041090559, %rax # imm = 0x7FF7FFFFFFFFFFFF +; X64-GENERIC-NEXT: cmpq %rax, %rcx +; X64-GENERIC-NEXT: setg %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: isquiet_d: +; 
X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movq %xmm0, %rax +; X64-NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NDD-NEXT: andq %rcx, %rax +; X64-NDD-NEXT: movabsq $9221120237041090559, %rcx # imm = 0x7FF7FFFFFFFFFFFF +; X64-NDD-NEXT: cmpq %rcx, %rax +; X64-NDD-NEXT: setg %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 2) ; "qnan" ret i1 %0 } define i1 @isnan_d_strictfp(double %x) strictfp { -; CHECK-32-LABEL: isnan_d_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: xorl %ecx, %ecx -; CHECK-32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx -; CHECK-32-NEXT: movl $2146435072, %ecx # imm = 0x7FF00000 -; CHECK-32-NEXT: sbbl %eax, %ecx -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_d_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-64-NEXT: andq %rax, %rcx -; CHECK-64-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 -; CHECK-64-NEXT: cmpq %rax, %rcx -; CHECK-64-NEXT: setg %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_d_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $2146435072, %ecx # imm = 0x7FF00000 +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: setl %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: isnan_d_strictfp: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movq %xmm0, %rax +; X64-GENERIC-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-GENERIC-NEXT: andq %rax, %rcx +; X64-GENERIC-NEXT: movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000 +; X64-GENERIC-NEXT: cmpq %rax, %rcx +; X64-GENERIC-NEXT: setg %al +; 
X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: isnan_d_strictfp: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movq %xmm0, %rax +; X64-NDD-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; X64-NDD-NEXT: andq %rcx, %rax +; X64-NDD-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000 +; X64-NDD-NEXT: cmpq %rcx, %rax +; X64-NDD-NEXT: setg %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; "nan" ret i1 %0 } define i1 @iszero_d_strictfp(double %x) strictfp { -; CHECK-32-LABEL: iszero_d_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: orl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_d_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movq %xmm0, %rax -; CHECK-64-NEXT: shlq %rax -; CHECK-64-NEXT: testq %rax, %rax -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_d_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_d_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %xmm0, %rax +; X64-NEXT: shlq %rax +; X64-NEXT: testq %rax, %rax +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 96) strictfp ; 0x60 = "zero" ret i1 %0 @@ -1339,70 +1417,70 @@ entry: define <1 x i1> @isnan_v1f(<1 x float> %x) { -; CHECK-32-LABEL: isnan_v1f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_v1f: -; CHECK-64: # %bb.0: # %entry -; 
CHECK-64-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-64-NEXT: setp %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_v1f: +; X86: # %bb.0: # %entry +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %al +; X86-NEXT: retl +; +; X64-LABEL: isnan_v1f: +; X64: # %bb.0: # %entry +; X64-NEXT: ucomiss %xmm0, %xmm0 +; X64-NEXT: setp %al +; X64-NEXT: retq entry: %0 = tail call <1 x i1> @llvm.is.fpclass.v1f32(<1 x float> %x, i32 3) ; "nan" ret <1 x i1> %0 } define <1 x i1> @isnan_v1f_strictfp(<1 x float> %x) strictfp { -; CHECK-32-LABEL: isnan_v1f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_v1f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_v1f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: retl +; +; X64-LABEL: isnan_v1f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: retq entry: %0 = tail call <1 x i1> @llvm.is.fpclass.v1f32(<1 x float> %x, i32 3) strictfp ; "nan" ret <1 x i1> %0 } define <2 x i1> @isnan_v2f(<2 x float> %x) { -; CHECK-32-LABEL: isnan_v2f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; 
CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %cl -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %dl -; CHECK-32-NEXT: movl %ecx, %eax -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_v2f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: cmpunordps %xmm0, %xmm0 -; CHECK-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_v2f: +; X86: # %bb.0: # %entry +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %cl +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %dl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: isnan_v2f: +; X64: # %bb.0: # %entry +; X64-NEXT: cmpunordps %xmm0, %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: retq entry: %0 = tail call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 3) ; "nan" ret <2 x i1> %0 @@ -1410,173 +1488,173 @@ entry: define <2 x i1> @isnot_nan_v2f(<2 x float> %x) { -; CHECK-32-LABEL: isnot_nan_v2f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setnp %cl -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setnp %dl -; CHECK-32-NEXT: movl %ecx, %eax -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnot_nan_v2f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: cmpordps 
%xmm0, %xmm0 -; CHECK-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; CHECK-64-NEXT: retq +; X86-LABEL: isnot_nan_v2f: +; X86: # %bb.0: # %entry +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setnp %cl +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setnp %dl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: isnot_nan_v2f: +; X64: # %bb.0: # %entry +; X64-NEXT: cmpordps %xmm0, %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: retq entry: %0 = tail call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 1020) ; 0x3fc = "zero|subnormal|normal|inf" ret <2 x i1> %0 } define <2 x i1> @isnan_v2f_strictfp(<2 x float> %x) strictfp { -; CHECK-32-LABEL: isnan_v2f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl %ecx, %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dl -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isnan_v2f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; CHECK-64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_v2f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 
+; X86-NEXT: setge %dl +; X86-NEXT: retl +; +; X64-LABEL: isnan_v2f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: retq entry: %0 = tail call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 3) strictfp ; "nan" ret <2 x i1> %0 } define <4 x i1> @isnan_v4f(<4 x float> %x) { -; CHECK-32-LABEL: isnan_v4f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %dh -; CHECK-32-NEXT: shlb $2, %dh -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %dl -; CHECK-32-NEXT: shlb $3, %dl -; CHECK-32-NEXT: orb %dh, %dl -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %dh -; CHECK-32-NEXT: fucomp %st(0) -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setp %al -; CHECK-32-NEXT: addb %al, %al -; CHECK-32-NEXT: orb %dh, %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: movb %al, (%ecx) -; CHECK-32-NEXT: movl %ecx, %eax -; CHECK-32-NEXT: retl $4 -; -; CHECK-64-LABEL: isnan_v4f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: cmpunordps %xmm0, %xmm0 -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_v4f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: flds {{[0-9]+}}(%esp) +; 
X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %dh +; X86-NEXT: shlb $2, %dh +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %dl +; X86-NEXT: shlb $3, %dl +; X86-NEXT: orb %dh, %dl +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %dh +; X86-NEXT: fucomp %st(0) +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setp %al +; X86-NEXT: addb %al, %al +; X86-NEXT: orb %dh, %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: movb %al, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: retl $4 +; +; X64-LABEL: isnan_v4f: +; X64: # %bb.0: # %entry +; X64-NEXT: cmpunordps %xmm0, %xmm0 +; X64-NEXT: retq entry: %0 = tail call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %x, i32 3) ; "nan" ret <4 x i1> %0 } define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp { -; CHECK-32-LABEL: isnan_v4f_strictfp: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: pushl %esi -; CHECK-32-NEXT: .cfi_def_cfa_offset 8 -; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-32-NEXT: andl %ecx, %edx -; CHECK-32-NEXT: cmpl $2139095041, %edx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dh -; CHECK-32-NEXT: shlb $2, %dh -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-32-NEXT: andl %ecx, %esi -; CHECK-32-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dl -; CHECK-32-NEXT: shlb $3, %dl -; CHECK-32-NEXT: orb %dh, %dl -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-32-NEXT: andl %ecx, %esi -; CHECK-32-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dh -; 
CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: addb %cl, %cl -; CHECK-32-NEXT: orb %dh, %cl -; CHECK-32-NEXT: orb %dl, %cl -; CHECK-32-NEXT: movb %cl, (%eax) -; CHECK-32-NEXT: popl %esi -; CHECK-32-NEXT: .cfi_def_cfa_offset 4 -; CHECK-32-NEXT: retl $4 -; -; CHECK-64-LABEL: isnan_v4f_strictfp: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-64-NEXT: retq +; X86-LABEL: isnan_v4f_strictfp: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %ecx, %edx +; X86-NEXT: cmpl $2139095041, %edx # imm = 0x7F800001 +; X86-NEXT: setge %dh +; X86-NEXT: shlb $2, %dh +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl %ecx, %esi +; X86-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 +; X86-NEXT: setge %dl +; X86-NEXT: shlb $3, %dl +; X86-NEXT: orb %dh, %dl +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl %ecx, %esi +; X86-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001 +; X86-NEXT: setge %dh +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X86-NEXT: setge %cl +; X86-NEXT: addb %cl, %cl +; X86-NEXT: orb %dh, %cl +; X86-NEXT: orb %dl, %cl +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl $4 +; +; X64-LABEL: isnan_v4f_strictfp: +; X64: # %bb.0: # %entry +; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: retq entry: %0 = tail call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %x, i32 3) strictfp ; "nan" ret <4 x i1> %0 } define i1 @isnone_f(float %x) { -; 
CHECK-32-LABEL: isnone_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: xorl %eax, %eax -; CHECK-32-NEXT: retl +; X86-LABEL: isnone_f: +; X86: # %bb.0: # %entry +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl ; -; CHECK-64-LABEL: isnone_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: xorl %eax, %eax -; CHECK-64-NEXT: retq +; X64-LABEL: isnone_f: +; X64: # %bb.0: # %entry +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0) ret i1 %0 } define i1 @isany_f(float %x) { -; CHECK-32-LABEL: isany_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movb $1, %al -; CHECK-32-NEXT: retl +; X86-LABEL: isany_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movb $1, %al +; X86-NEXT: retl ; -; CHECK-64-LABEL: isany_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movb $1, %al -; CHECK-64-NEXT: retq +; X64-LABEL: isany_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movb $1, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023) ret i1 %0 @@ -1584,1056 +1662,1263 @@ entry: define i1 @iszero_or_nan_f(float %x) { -; CHECK-32-LABEL: iszero_or_nan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fldz -; CHECK-32-NEXT: fucompp -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_or_nan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: xorps %xmm1, %xmm1 -; CHECK-64-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_or_nan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fldz +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_or_nan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; 
X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 } define i1 @iszero_or_nan_f_daz(float %x) #0 { -; CHECK-32-LABEL: iszero_or_nan_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_or_nan_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_or_nan_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_or_nan_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %cl +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 } define i1 @iszero_or_nan_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: iszero_or_nan_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; 
CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_or_nan_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_or_nan_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_or_nan_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %cl +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 } define i1 @not_iszero_or_nan_f(float %x) { -; CHECK-32-LABEL: not_iszero_or_nan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fldz -; CHECK-32-NEXT: fucompp -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_or_nan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: xorps %xmm1, %xmm1 -; CHECK-64-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_or_nan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fldz +; X86-NEXT: fucompp +; X86-NEXT: 
fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_iszero_or_nan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 924) ; ~0x60 = "~(zero|nan)" ret i1 %0 } define i1 @not_iszero_or_nan_f_daz(float %x) #0 { -; CHECK-32-LABEL: not_iszero_or_nan_f_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setl %cl -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: andb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_or_nan_f_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setl %cl -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: andb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_or_nan_f_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setl %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_iszero_or_nan_f_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setl %cl +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" ret i1 %0 } define i1 
@not_iszero_or_nan_f_maybe_daz(float %x) #1 { -; CHECK-32-LABEL: not_iszero_or_nan_f_maybe_daz: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setl %cl -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: andb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_or_nan_f_maybe_daz: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setl %cl -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: andb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_or_nan_f_maybe_daz: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setl %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_iszero_or_nan_f_maybe_daz: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setl %cl +; X64-NEXT: testl %eax, %eax +; X64-NEXT: setne %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" ret i1 %0 } define i1 @iszero_or_qnan_f(float %x) { -; CHECK-32-LABEL: iszero_or_qnan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: orb %cl, %al 
-; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_or_qnan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_or_qnan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: iszero_or_qnan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge %cl +; X64-NEXT: testl %eax, %eax +; X64-NEXT: sete %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 98) ; 0x60|0x2 = "zero|qnan" ret i1 %0 } define i1 @iszero_or_snan_f(float %x) { -; CHECK-32-LABEL: iszero_or_snan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %cl -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dl -; CHECK-32-NEXT: andb %cl, %dl -; CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: iszero_or_snan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %cl -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; 
CHECK-64-NEXT: setge %dl -; CHECK-64-NEXT: andb %cl, %dl -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: iszero_or_snan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %dl +; X86-NEXT: andb %cl, %dl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: iszero_or_snan_f: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %cl +; X64-GENERIC-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %dl +; X64-GENERIC-NEXT: andb %cl, %dl +; X64-GENERIC-NEXT: testl %eax, %eax +; X64-GENERIC-NEXT: sete %al +; X64-GENERIC-NEXT: orb %dl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: iszero_or_snan_f: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %cl +; X64-NDD-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NDD-NEXT: setge %dl +; X64-NDD-NEXT: andb %dl, %cl +; X64-NDD-NEXT: testl %eax, %eax +; X64-NDD-NEXT: sete %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 97) ; 0x60|0x1 = "zero|snan" ret i1 %0 } define i1 @not_iszero_or_qnan_f(float %x) { -; CHECK-32-LABEL: not_iszero_or_qnan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: pushl %esi -; CHECK-32-NEXT: .cfi_def_cfa_offset 8 -; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; 
CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %cl -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dl -; CHECK-32-NEXT: andb %cl, %dl -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: leal -1(%eax), %esi -; CHECK-32-NEXT: cmpl $8388607, %esi # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %ch -; CHECK-32-NEXT: orb %cl, %ch -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: orb %ch, %al -; CHECK-32-NEXT: popl %esi -; CHECK-32-NEXT: .cfi_def_cfa_offset 4 -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_or_qnan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %cl -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %dl -; CHECK-64-NEXT: andb %cl, %dl -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: leal -1(%rax), %esi -; CHECK-64-NEXT: cmpl $8388607, %esi # imm = 0x7FFFFF -; CHECK-64-NEXT: setb %sil -; CHECK-64-NEXT: orb %cl, %sil -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: orb %sil, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_or_qnan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2139095041, %eax # 
imm = 0x7F800001 +; X86-NEXT: setge %dl +; X86-NEXT: andb %cl, %dl +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: cmpl $8388607, %esi # imm = 0x7FFFFF +; X86-NEXT: setb %ch +; X86-NEXT: orb %cl, %ch +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setb %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: orb %ch, %al +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_iszero_or_qnan_f: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %cl +; X64-GENERIC-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %dl +; X64-GENERIC-NEXT: andb %cl, %dl +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %cl +; X64-GENERIC-NEXT: leal -1(%rax), %esi +; X64-GENERIC-NEXT: cmpl $8388607, %esi # imm = 0x7FFFFF +; X64-GENERIC-NEXT: setb %sil +; X64-GENERIC-NEXT: orb %cl, %sil +; X64-GENERIC-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-GENERIC-NEXT: setb %al +; X64-GENERIC-NEXT: orb %dl, %al +; X64-GENERIC-NEXT: orb %sil, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_iszero_or_qnan_f: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %cl +; X64-NDD-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NDD-NEXT: setge %dl +; X64-NDD-NEXT: andb %dl, %cl +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %dl +; X64-NDD-NEXT: decl %eax, %esi +; X64-NDD-NEXT: cmpl $8388607, %esi # imm = 0x7FFFFF +; 
X64-NDD-NEXT: setb %sil +; X64-NDD-NEXT: orb %sil, %dl +; X64-NDD-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-NDD-NEXT: setb %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: orb %dl, %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 925) ; ~(0x60|0x2) = "~(zero|qnan)" ret i1 %0 } define i1 @not_iszero_or_snan_f(float %x) { -; CHECK-32-LABEL: not_iszero_or_snan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: leal -1(%eax), %edx -; CHECK-32-NEXT: cmpl $8388607, %edx # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %dl -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %ch -; CHECK-32-NEXT: orb %cl, %ch -; CHECK-32-NEXT: orb %dl, %ch -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: orb %ch, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_iszero_or_snan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: leal -1(%rax), %edx -; CHECK-64-NEXT: cmpl $8388607, %edx # imm = 0x7FFFFF -; CHECK-64-NEXT: setb %dl -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %sil -; CHECK-64-NEXT: orb %cl, %sil -; CHECK-64-NEXT: orb %dl, %sil -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: orb %sil, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_iszero_or_snan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm 
= 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: leal -1(%eax), %edx +; X86-NEXT: cmpl $8388607, %edx # imm = 0x7FFFFF +; X86-NEXT: setb %dl +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %ch +; X86-NEXT: orb %cl, %ch +; X86-NEXT: orb %dl, %ch +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setb %al +; X86-NEXT: orb %ch, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_iszero_or_snan_f: +; X64-GENERIC: # %bb.0: # %entry +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %cl +; X64-GENERIC-NEXT: leal -1(%rax), %edx +; X64-GENERIC-NEXT: cmpl $8388607, %edx # imm = 0x7FFFFF +; X64-GENERIC-NEXT: setb %dl +; X64-GENERIC-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setge %sil +; X64-GENERIC-NEXT: orb %cl, %sil +; X64-GENERIC-NEXT: orb %dl, %sil +; X64-GENERIC-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-GENERIC-NEXT: setb %al +; X64-GENERIC-NEXT: orb %sil, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_iszero_or_snan_f: +; X64-NDD: # %bb.0: # %entry +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %cl +; X64-NDD-NEXT: decl %eax, %edx +; X64-NDD-NEXT: cmpl $8388607, %edx # imm = 0x7FFFFF +; X64-NDD-NEXT: setb %dl +; X64-NDD-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NDD-NEXT: setge %sil +; X64-NDD-NEXT: orb %sil, %cl +; X64-NDD-NEXT: orb %dl, %cl +; X64-NDD-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-NDD-NEXT: setb %al +; X64-NDD-NEXT: orb 
%cl, %al +; X64-NDD-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 926) ; ~(0x60|0x1) = "~(zero|snan)" ret i1 %0 } define i1 @isinf_or_nan_f(float %x) { -; CHECK-32-LABEL: isinf_or_nan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isinf_or_nan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: retq +; X86-LABEL: isinf_or_nan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setge %al +; X86-NEXT: retl +; +; X64-LABEL: isinf_or_nan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setge %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; 0x204|0x3 = "inf|nan" ret i1 %0 } define i1 @not_isinf_or_nan_f(float %x) { -; CHECK-32-LABEL: not_isinf_or_nan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_isinf_or_nan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isinf_or_nan_f: +; X86: # %bb.0: # %entry +; 
X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: retl +; +; X64-LABEL: not_isinf_or_nan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; ~(0x204|0x3) = "~(inf|nan)" ret i1 %0 } define i1 @isfinite_or_nan_f(float %x) { -; CHECK-32-LABEL: isfinite_or_nan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: isfinite_or_nan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: isfinite_or_nan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: isfinite_or_nan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; 0x1f8|0x3 = "finite|nan" ret i1 %0 } define i1 @not_isfinite_or_nan_f(float %x) { -; CHECK-32-LABEL: not_isfinite_or_nan_f: -; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl 
$2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_isfinite_or_nan_f: -; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_isfinite_or_nan_f: +; X86: # %bb.0: # %entry +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: not_isfinite_or_nan_f: +; X64: # %bb.0: # %entry +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %al +; X64-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; ~(0x1f8|0x3) = "~(finite|nan)" ret i1 %0 } define i1 @is_plus_inf_or_nan_f(float %x) { -; CHECK-32-LABEL: is_plus_inf_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_inf_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_inf_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl 
$2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: is_plus_inf_or_nan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 515) ; 0x200|0x3 = "+inf|nan" ret i1 %class } define i1 @is_minus_inf_or_nan_f(float %x) { -; CHECK-32-LABEL: is_minus_inf_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_minus_inf_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_minus_inf_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: is_minus_inf_or_nan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; 
X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 7) ; "-inf|nan" ret i1 %class } define i1 @not_is_plus_inf_or_nan_f(float %x) { -; CHECK-32-LABEL: not_is_plus_inf_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_plus_inf_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_plus_inf_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_plus_inf_or_nan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 508) ; ~(0x200|0x3) = "~(+inf|nan)" ret i1 %class } define i1 @not_is_minus_inf_or_nan_f(float %x) { -; CHECK-32-LABEL: not_is_minus_inf_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_minus_inf_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_minus_inf_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_is_minus_inf_or_nan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setl %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1016) ; "~(-inf|nan)" ret i1 %class } define i1 @is_plus_inf_or_snan_f(float %x) { -; CHECK-32-LABEL: is_plus_inf_or_snan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: movl %eax, %ecx -; CHECK-32-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %dl -; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: andb %dl, %cl -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm 
= 0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_inf_or_snan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: movl %eax, %ecx -; CHECK-64-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: andb %dl, %cl -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_inf_or_snan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X86-NEXT: setl %dl +; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X86-NEXT: setge %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: is_plus_inf_or_snan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: movl %eax, %ecx +; X64-GENERIC-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %cl +; X64-GENERIC-NEXT: andb %dl, %cl +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %al +; X64-GENERIC-NEXT: orb %cl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: is_plus_inf_or_snan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %dl +; X64-NDD-NEXT: cmpl $2139095041, %ecx # 
imm = 0x7F800001 +; X64-NDD-NEXT: setge %cl +; X64-NDD-NEXT: andb %dl, %cl +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 513) ; 0x200|0x1 = "+inf|snan" ret i1 %class } define i1 @is_plus_inf_or_qnan_f(float %x) { -; CHECK-32-LABEL: is_plus_inf_or_qnan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_plus_inf_or_qnan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_plus_inf_or_qnan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: is_plus_inf_or_qnan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 514) ; 0x200|0x1 = "+inf|qnan" ret i1 %class } define i1 @not_is_plus_inf_or_snan_f(float %x) { -; 
CHECK-32-LABEL: not_is_plus_inf_or_snan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %dl -; CHECK-32-NEXT: orb %cl, %dl -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_plus_inf_or_snan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: orb %cl, %dl -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_plus_inf_or_snan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %dl +; X86-NEXT: orb %cl, %dl +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_is_plus_inf_or_snan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: sete %cl +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: orb %cl, %dl +; X64-GENERIC-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setge %al +; X64-GENERIC-NEXT: orb 
%dl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_is_plus_inf_or_snan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: sete %cl +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: setl %dl +; X64-NDD-NEXT: orb %dl, %cl +; X64-NDD-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NDD-NEXT: setge %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 510) ; ~(+inf|snan) ret i1 %class } define i1 @not_is_plus_inf_or_qnan_f(float %x) { -; CHECK-32-LABEL: not_is_plus_inf_or_qnan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: movl %eax, %ecx -; CHECK-32-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %dl -; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dh -; CHECK-32-NEXT: andb %dl, %dh -; CHECK-32-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: sete %dl -; CHECK-32-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: orb %dh, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_plus_inf_or_qnan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: movl %eax, %ecx -; CHECK-64-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-64-NEXT: setge %sil -; CHECK-64-NEXT: andb %dl, %sil -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: sete %dl -; CHECK-64-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: orb %sil, %al -; 
CHECK-64-NEXT: retq +; X86-LABEL: not_is_plus_inf_or_qnan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X86-NEXT: setl %dl +; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X86-NEXT: setge %dh +; X86-NEXT: andb %dl, %dh +; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: sete %dl +; X86-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: orb %dh, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_is_plus_inf_or_qnan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: movl %eax, %ecx +; X64-GENERIC-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %sil +; X64-GENERIC-NEXT: andb %dl, %sil +; X64-GENERIC-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: sete %dl +; X64-GENERIC-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-GENERIC-NEXT: setl %al +; X64-GENERIC-NEXT: orb %dl, %al +; X64-GENERIC-NEXT: orb %sil, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_is_plus_inf_or_qnan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %dl +; X64-NDD-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-NDD-NEXT: setge %sil +; X64-NDD-NEXT: andb %sil, %dl +; X64-NDD-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: sete %al +; X64-NDD-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-NDD-NEXT: setl %cl +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: orb %dl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, 
i32 509) ; ~(+inf|qnan) ret i1 %class } define i1 @is_minus_inf_or_snan_f(float %x) { -; CHECK-32-LABEL: is_minus_inf_or_snan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: movl %eax, %ecx -; CHECK-32-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %dl -; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: andb %dl, %cl -; CHECK-32-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_minus_inf_or_snan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: movl %eax, %ecx -; CHECK-64-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: andb %dl, %cl -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_minus_inf_or_snan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X86-NEXT: setl %dl +; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X86-NEXT: setge %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: is_minus_inf_or_snan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: movl %eax, %ecx +; X64-GENERIC-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: cmpl 
$2139095041, %ecx # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %cl +; X64-GENERIC-NEXT: andb %dl, %cl +; X64-GENERIC-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: sete %al +; X64-GENERIC-NEXT: orb %cl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: is_minus_inf_or_snan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %dl +; X64-NDD-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-NDD-NEXT: setge %cl +; X64-NDD-NEXT: andb %dl, %cl +; X64-NDD-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: sete %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 5) ; "-inf|snan" ret i1 %class } define i1 @is_minus_inf_or_qnan_f(float %x) { -; CHECK-32-LABEL: is_minus_inf_or_qnan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: is_minus_inf_or_qnan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: is_minus_inf_or_qnan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: 
orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: is_minus_inf_or_qnan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 6) ; "-inf|qnan" ret i1 %class } define i1 @not_is_minus_inf_or_snan_f(float %x) { -; CHECK-32-LABEL: not_is_minus_inf_or_snan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setl %dl -; CHECK-32-NEXT: orb %cl, %dl -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_minus_inf_or_snan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: orb %cl, %dl -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_minus_inf_or_snan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setl %dl +; X86-NEXT: orb %cl, %dl +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: 
retl +; +; X64-GENERIC-LABEL: not_is_minus_inf_or_snan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %cl +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: orb %cl, %dl +; X64-GENERIC-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setge %al +; X64-GENERIC-NEXT: orb %dl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_is_minus_inf_or_snan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %cl +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: setl %dl +; X64-NDD-NEXT: orb %dl, %cl +; X64-NDD-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NDD-NEXT: setge %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1018) ; "~(-inf|snan)" ret i1 %class } define i1 @not_is_minus_inf_or_qnan_f(float %x) { -; CHECK-32-LABEL: not_is_minus_inf_or_qnan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: movl %eax, %ecx -; CHECK-32-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %dl -; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dh -; CHECK-32-NEXT: andb %dl, %dh -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %dl -; CHECK-32-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: orb %dh, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_is_minus_inf_or_qnan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; 
CHECK-64-NEXT: movl %eax, %ecx -; CHECK-64-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-64-NEXT: setge %sil -; CHECK-64-NEXT: andb %dl, %sil -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %dl -; CHECK-64-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: orb %sil, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_is_minus_inf_or_qnan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X86-NEXT: setl %dl +; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X86-NEXT: setge %dh +; X86-NEXT: andb %dl, %dh +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %dl +; X86-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X86-NEXT: setl %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: orb %dh, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_is_minus_inf_or_qnan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: movl %eax, %ecx +; X64-GENERIC-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %sil +; X64-GENERIC-NEXT: andb %dl, %sil +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %dl +; X64-GENERIC-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-GENERIC-NEXT: setl %al +; X64-GENERIC-NEXT: orb %dl, %al +; X64-GENERIC-NEXT: orb %sil, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_is_minus_inf_or_qnan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl 
$2147483647, %eax, %ecx # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %dl +; X64-NDD-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-NDD-NEXT: setge %sil +; X64-NDD-NEXT: andb %sil, %dl +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %al +; X64-NDD-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-NDD-NEXT: setl %cl +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: orb %dl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1017) ; "-inf|qnan" ret i1 %class } define i1 @issubnormal_or_nan_f(float %x) { -; CHECK-32-LABEL: issubnormal_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: decl %eax -; CHECK-32-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: decl %eax -; CHECK-64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %cl +; X86-NEXT: decl %eax +; X86-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X86-NEXT: setb %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_or_nan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl 
$2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %cl +; X64-NEXT: decl %eax +; X64-NEXT: cmpl $8388607, %eax # imm = 0x7FFFFF +; X64-NEXT: setb %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 147) ; 0x90|0x3 = "subnormal|nan" ret i1 %class } define i1 @issubnormal_or_zero_or_nan_f(float %x) { -; CHECK-32-LABEL: issubnormal_or_zero_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_zero_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_zero_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_or_zero_or_nan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setge %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243) ; 0xf0|0x3 = "subnormal|zero|nan" ret i1 %class } define i1 
@issubnormal_or_zero_or_nan_f_daz(float %x) #0 { -; CHECK-32-LABEL: issubnormal_or_zero_or_nan_f_daz: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fldz -; CHECK-32-NEXT: fucompp -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_zero_or_nan_f_daz: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: xorps %xmm1, %xmm1 -; CHECK-64-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_zero_or_nan_f_daz: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fldz +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_or_zero_or_nan_f_daz: +; X64: # %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: sete %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243) ; 0xf0|0x3 = "subnormal|zero|nan" ret i1 %class } define i1 @issubnormal_or_zero_or_snan_f(float %x) { -; CHECK-32-LABEL: issubnormal_or_zero_or_snan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: movl %eax, %ecx -; CHECK-32-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %dl -; CHECK-32-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: andb %dl, %cl -; CHECK-32-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_zero_or_snan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: movl %eax, %ecx -; CHECK-64-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %ecx # imm = 
0x7FC00000 -; CHECK-64-NEXT: setl %dl -; CHECK-64-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: andb %dl, %cl -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_zero_or_snan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X86-NEXT: setl %dl +; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X86-NEXT: setge %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: issubnormal_or_zero_or_snan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: movl %eax, %ecx +; X64-GENERIC-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %dl +; X64-GENERIC-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %cl +; X64-GENERIC-NEXT: andb %dl, %cl +; X64-GENERIC-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %al +; X64-GENERIC-NEXT: orb %cl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: issubnormal_or_zero_or_snan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %ecx # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %dl +; X64-NDD-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001 +; X64-NDD-NEXT: setge %cl +; X64-NDD-NEXT: andb %dl, %cl +; X64-NDD-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 241) ; 0x90|0x1 = "subnormal|snan" ret i1 
%class } define i1 @issubnormal_or_zero_or_qnan_f(float %x) { -; CHECK-32-LABEL: issubnormal_or_zero_or_qnan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: issubnormal_or_zero_or_qnan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: issubnormal_or_zero_or_qnan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: issubnormal_or_zero_or_qnan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: sete %cl +; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NEXT: setge %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 242) ; 0x90|0x2 = "subnormal|qnan" ret i1 %class } define i1 @not_issubnormal_or_nan_f(float %x) { -; CHECK-32-LABEL: not_issubnormal_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; 
CHECK-32-NEXT: testl %eax, %eax -; CHECK-32-NEXT: sete %dl -; CHECK-32-NEXT: orb %cl, %dl -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: testl %eax, %eax -; CHECK-64-NEXT: sete %dl -; CHECK-64-NEXT: orb %cl, %dl -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %dl +; X86-NEXT: orb %cl, %dl +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setb %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_issubnormal_or_nan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %cl +; X64-GENERIC-NEXT: testl %eax, %eax +; X64-GENERIC-NEXT: sete %dl +; X64-GENERIC-NEXT: orb %cl, %dl +; X64-GENERIC-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-GENERIC-NEXT: setb %al +; X64-GENERIC-NEXT: orb %dl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_issubnormal_or_nan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, 
%eax +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %cl +; X64-NDD-NEXT: testl %eax, %eax +; X64-NDD-NEXT: sete %dl +; X64-NDD-NEXT: orb %dl, %cl +; X64-NDD-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-NDD-NEXT: setb %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 876) ; ~(0x90|0x3) = ~"subnormal|nan" ret i1 %class } define i1 @not_issubnormal_or_zero_or_nan_f(float %x) { -; CHECK-32-LABEL: not_issubnormal_or_zero_or_nan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: setne %cl -; CHECK-32-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setl %al -; CHECK-32-NEXT: andb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_or_zero_or_nan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: setne %cl -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setl %al -; CHECK-64-NEXT: andb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_zero_or_nan_f: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: setne %cl +; X86-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setl %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_or_zero_or_nan_f: +; X64: # %bb.0: +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: testl $2139095040, %eax # imm = 0x7F800000 +; X64-NEXT: setne %cl +; X64-NEXT: andl $2147483647, %eax # imm = 
0x7FFFFFFF +; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NEXT: setl %al +; X64-NEXT: andb %cl, %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780) ; ~(0xf0|0x3) = ~"subnormal|zero|nan" ret i1 %class } define i1 @not_issubnormal_or_zero_or_nan_f_daz(float %x) #0 { -; CHECK-32-LABEL: not_issubnormal_or_zero_or_nan_f_daz: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: flds {{[0-9]+}}(%esp) -; CHECK-32-NEXT: fldz -; CHECK-32-NEXT: fucompp -; CHECK-32-NEXT: fnstsw %ax -; CHECK-32-NEXT: # kill: def $ah killed $ah killed $ax -; CHECK-32-NEXT: sahf -; CHECK-32-NEXT: setne %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_or_zero_or_nan_f_daz: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: xorps %xmm1, %xmm1 -; CHECK-64-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-64-NEXT: setne %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_zero_or_nan_f_daz: +; X86: # %bb.0: +; X86-NEXT: flds {{[0-9]+}}(%esp) +; X86-NEXT: fldz +; X86-NEXT: fucompp +; X86-NEXT: fnstsw %ax +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: not_issubnormal_or_zero_or_nan_f_daz: +; X64: # %bb.0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: ucomiss %xmm1, %xmm0 +; X64-NEXT: setne %al +; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780) ; ~(0xf0|0x3) = ~"subnormal|zero|nan" ret i1 %class } define i1 @not_issubnormal_or_zero_or_snan_f(float %x) { -; CHECK-32-LABEL: not_issubnormal_or_zero_or_snan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setge %cl -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %dl -; CHECK-32-NEXT: orb %cl, %dl -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; 
CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: orb %dl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_or_zero_or_snan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setge %cl -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %dl -; CHECK-64-NEXT: orb %cl, %dl -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: orb %dl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_zero_or_snan_f: +; X86: # %bb.0: +; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setge %cl +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %dl +; X86-NEXT: orb %cl, %dl +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setb %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_issubnormal_or_zero_or_snan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setge %cl +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %dl +; X64-GENERIC-NEXT: orb %cl, %dl +; X64-GENERIC-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-GENERIC-NEXT: setb %al +; X64-GENERIC-NEXT: orb %dl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_issubnormal_or_zero_or_snan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, 
%eax # imm = 0x7FC00000 +; X64-NDD-NEXT: setge %cl +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %dl +; X64-NDD-NEXT: orb %dl, %cl +; X64-NDD-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-NDD-NEXT: setb %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 782) ; ~(0x90|0x1) = ~"subnormal|snan" ret i1 %class } define i1 @not_issubnormal_or_zero_or_qnan_f(float %x) { -; CHECK-32-LABEL: not_issubnormal_or_zero_or_qnan_f: -; CHECK-32: # %bb.0: -; CHECK-32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-32-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-32-NEXT: setl %cl -; CHECK-32-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-32-NEXT: setge %dl -; CHECK-32-NEXT: andb %cl, %dl -; CHECK-32-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-32-NEXT: sete %cl -; CHECK-32-NEXT: orb %dl, %cl -; CHECK-32-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-32-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-32-NEXT: setb %al -; CHECK-32-NEXT: orb %cl, %al -; CHECK-32-NEXT: retl -; -; CHECK-64-LABEL: not_issubnormal_or_zero_or_qnan_f: -; CHECK-64: # %bb.0: -; CHECK-64-NEXT: movd %xmm0, %eax -; CHECK-64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; CHECK-64-NEXT: setl %cl -; CHECK-64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; CHECK-64-NEXT: setge %dl -; CHECK-64-NEXT: andb %cl, %dl -; CHECK-64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; CHECK-64-NEXT: sete %cl -; CHECK-64-NEXT: orb %dl, %cl -; CHECK-64-NEXT: addl $-8388608, %eax # imm = 0xFF800000 -; CHECK-64-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 -; CHECK-64-NEXT: setb %al -; CHECK-64-NEXT: orb %cl, %al -; CHECK-64-NEXT: retq +; X86-LABEL: not_issubnormal_or_zero_or_qnan_f: +; X86: # %bb.0: 
+; X86-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-NEXT: setl %cl +; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X86-NEXT: setge %dl +; X86-NEXT: andb %cl, %dl +; X86-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-NEXT: sete %cl +; X86-NEXT: orb %dl, %cl +; X86-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X86-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X86-NEXT: setb %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: retl +; +; X64-GENERIC-LABEL: not_issubnormal_or_zero_or_qnan_f: +; X64-GENERIC: # %bb.0: +; X64-GENERIC-NEXT: movd %xmm0, %eax +; X64-GENERIC-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GENERIC-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GENERIC-NEXT: setl %cl +; X64-GENERIC-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-GENERIC-NEXT: setge %dl +; X64-GENERIC-NEXT: andb %cl, %dl +; X64-GENERIC-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GENERIC-NEXT: sete %cl +; X64-GENERIC-NEXT: orb %dl, %cl +; X64-GENERIC-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-GENERIC-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-GENERIC-NEXT: setb %al +; X64-GENERIC-NEXT: orb %cl, %al +; X64-GENERIC-NEXT: retq +; +; X64-NDD-LABEL: not_issubnormal_or_zero_or_qnan_f: +; X64-NDD: # %bb.0: +; X64-NDD-NEXT: movd %xmm0, %eax +; X64-NDD-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-NDD-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-NDD-NEXT: setl %cl +; X64-NDD-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-NDD-NEXT: setge %dl +; X64-NDD-NEXT: andb %dl, %cl +; X64-NDD-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-NDD-NEXT: sete %dl +; X64-NDD-NEXT: orb %dl, %cl +; X64-NDD-NEXT: addl $-8388608, %eax # imm = 0xFF800000 +; X64-NDD-NEXT: cmpl $2130706432, %eax # imm = 0x7F000000 +; X64-NDD-NEXT: setb %al +; X64-NDD-NEXT: orb %cl, %al +; X64-NDD-NEXT: retq %class = tail 
call i1 @llvm.is.fpclass.f32(float %x, i32 781) ; ~(0x90|0x2) = ~"subnormal|qnan" ret i1 %class } From 214ff5036cb407222e6ff34ef2c1eeef55c70b4a Mon Sep 17 00:00:00 2001 From: aengelke Date: Tue, 11 Jun 2024 13:08:55 +0200 Subject: [PATCH 44/82] [X86] Add AMXProgModel to YAML serialization (#94988) This allows tested passes to depend on the AMX model in the function info. Preparatory work to adopt #94358 for other AMX passes. --- .../lib/Target/X86/X86MachineFunctionInfo.cpp | 13 ++++++++ llvm/lib/Target/X86/X86MachineFunctionInfo.h | 32 +++++++++++++++++++ llvm/lib/Target/X86/X86TargetMachine.cpp | 20 ++++++++++++ llvm/lib/Target/X86/X86TargetMachine.h | 8 +++++ .../CodeGen/X86/AMX/amx-fastconfig-phi.mir | 3 +- .../CodeGen/X86/AMX/amx-fastconfig-phi2.mir | 3 +- .../CodeGen/X86/AMX/amx-fastconfig-phi4.mir | 3 +- .../CodeGen/X86/AMX/amx-fastconfig-spill.mir | 6 ++-- llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir | 3 +- .../CodeGen/X86/AMX/amx-fastpreconfig.mir | 5 ++- 10 files changed, 89 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index 2e88e01ce7fdfa..7b57f7c23bf4da 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -13,6 +13,14 @@ using namespace llvm; +yaml::X86MachineFunctionInfo::X86MachineFunctionInfo( + const llvm::X86MachineFunctionInfo &MFI) + : AMXProgModel(MFI.getAMXProgModel()) {} + +void yaml::X86MachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) { + MappingTraits::mapping(YamlIO, *this); +} + MachineFunctionInfo *X86MachineFunctionInfo::clone( BumpPtrAllocator &Allocator, MachineFunction &DestMF, const DenseMap &Src2DstMBB) @@ -20,6 +28,11 @@ MachineFunctionInfo *X86MachineFunctionInfo::clone( return DestMF.cloneInfo(*this); } +void X86MachineFunctionInfo::initializeBaseYamlFields( + const yaml::X86MachineFunctionInfo &YamlMFI) { + AMXProgModel = YamlMFI.AMXProgModel; +} + void
X86MachineFunctionInfo::anchor() { } void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) { diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 8aaa49945f9d44..af2de2e73dc368 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -16,13 +16,43 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Support/YAMLTraits.h" #include namespace llvm { enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 }; +class X86MachineFunctionInfo; + +namespace yaml { +template <> struct ScalarEnumerationTraits { + static void enumeration(IO &YamlIO, AMXProgModelEnum &Value) { + YamlIO.enumCase(Value, "None", AMXProgModelEnum::None); + YamlIO.enumCase(Value, "DirectReg", AMXProgModelEnum::DirectReg); + YamlIO.enumCase(Value, "ManagedRA", AMXProgModelEnum::ManagedRA); + } +}; + +struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo { + AMXProgModelEnum AMXProgModel; + + X86MachineFunctionInfo() = default; + X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI); + + void mappingImpl(yaml::IO &YamlIO) override; + ~X86MachineFunctionInfo() = default; +}; + +template <> struct MappingTraits { + static void mapping(IO &YamlIO, X86MachineFunctionInfo &MFI) { + YamlIO.mapOptional("amxProgModel", MFI.AMXProgModel); + } +}; +} // end namespace yaml + /// X86MachineFunctionInfo - This class is derived from MachineFunction and /// contains private X86 target-specific information for each MachineFunction. 
class X86MachineFunctionInfo : public MachineFunctionInfo { @@ -160,6 +190,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { const DenseMap &Src2DstMBB) const override; + void initializeBaseYamlFields(const yaml::X86MachineFunctionInfo &YamlMFI); + bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 27542e54829bff..d4e642c7df9cf9 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -31,6 +31,8 @@ #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" +#include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" @@ -344,6 +346,24 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } +yaml::MachineFunctionInfo *X86TargetMachine::createDefaultFuncInfoYAML() const { + return new yaml::X86MachineFunctionInfo(); +} + +yaml::MachineFunctionInfo * +X86TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo(); + return new yaml::X86MachineFunctionInfo(*MFI); +} + +bool X86TargetMachine::parseMachineFunctionInfo( + const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, SMRange &SourceRange) const { + const auto &YamlMFI = static_cast(MFI); + PFS.MF.getInfo()->initializeBaseYamlFields(YamlMFI); + return false; +} + bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index 
4a5f20fcc01726..916445c74bb903 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -58,6 +58,14 @@ class X86TargetMachine final : public LLVMTargetMachine { createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; + yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override; + yaml::MachineFunctionInfo * + convertFuncInfoToYAML(const MachineFunction &MF) const override; + bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, + PerFunctionMIParsingState &PFS, + SMDiagnostic &Error, + SMRange &SourceRange) const override; + void registerPassBuilderCallbacks(PassBuilder &PB, bool PopulateClassToPassNames) override; diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir index e79f4d1f989a91..eef1f43b278d9d 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir @@ -87,7 +87,8 @@ liveins: - { reg: '$rsi', virtual-reg: '%14' } frameInfo: maxAlignment: 1 -machineFunctionInfo: {} +machineFunctionInfo: + amxProgModel: ManagedRA body: | ; CHECK-LABEL: name: foo ; CHECK: bb.0.entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir index d47bda0044115e..5843366baab6d1 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir @@ -34,7 +34,8 @@ liveins: - { reg: '$edi', virtual-reg: '%12' } frameInfo: maxAlignment: 1 -machineFunctionInfo: {} +machineFunctionInfo: + amxProgModel: ManagedRA body: | ; CHECK-LABEL: name: foo ; CHECK: bb.0.entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir index 15d3eb6bdfebb6..4eb8b950851895 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir +++ 
b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir @@ -35,7 +35,8 @@ liveins: - { reg: '$edi', virtual-reg: '%12' } frameInfo: maxAlignment: 1 -machineFunctionInfo: {} +machineFunctionInfo: + amxProgModel: ManagedRA body: | ; CHECK-LABEL: name: foo ; CHECK: bb.0.entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir index 98744bbe8e1473..1ed4328bf132a1 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir @@ -23,7 +23,8 @@ frameInfo: stack: - { id: 0, size: 1024, alignment: 16 } - { id: 1, size: 64, alignment: 4 } -machineFunctionInfo: {} +machineFunctionInfo: + amxProgModel: ManagedRA body: | ; CHECK-LABEL: name: foo ; CHECK: bb.0.entry: @@ -100,7 +101,8 @@ frameInfo: stack: - { id: 0, size: 1024, alignment: 16 } - { id: 1, size: 64, alignment: 4 } -machineFunctionInfo: {} +machineFunctionInfo: + amxProgModel: ManagedRA body: | ; CHECK-LABEL: name: copy ; CHECK: bb.0.entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir index 84fc47a3a91202..561ba6f2f49709 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir +++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir @@ -77,7 +77,8 @@ liveins: - { reg: '$edx', virtual-reg: '%11' } frameInfo: maxAlignment: 1 -machineFunctionInfo: {} +machineFunctionInfo: + amxProgModel: ManagedRA body: | ; CHECK-LABEL: name: test_api ; CHECK: bb.0.entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir index 40566520b79f01..0d56feac626814 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir +++ b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir @@ -23,7 +23,8 @@ frameInfo: stack: - { id: 0, size: 1024, alignment: 16 } - { id: 1, size: 64, alignment: 4 } -machineFunctionInfo: {} +machineFunctionInfo: + amxProgModel: ManagedRA body: | bb.0.entry: ; CHECK-LABEL: name: main @@ 
-79,6 +80,8 @@ registers: liveins: - { reg: '$rdi', virtual-reg: '' } - { reg: '$rsi', virtual-reg: '' } +machineFunctionInfo: + amxProgModel: ManagedRA body: | bb.1.entry: liveins: $rdi, $rsi From fc6e97cf2f28a9c7a73b97488ec6b90fc0d34a4a Mon Sep 17 00:00:00 2001 From: aengelke Date: Tue, 11 Jun 2024 13:10:36 +0200 Subject: [PATCH 45/82] [MC][X86] Avoid copying MCInst in emitInstrEnd (#94947) Copying an MCInst isn't cheap (copies all operands) and the whole instruction is only used for the Intel erratum mitigation, which is off by default. In all other cases, the opcode alone suffices. This slightly pessimizes code that uses moves to segment registers -- but that's uncommon and not performance-sensitive anyway. As a related change, also call canPadInst() only when the result is actually used, which is typically only the case in emitInstrEnd. This gives a minor performance improvement. --- .../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 53 ++++++++++++------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 30f22cd322fecf..bc2eb6dcd541c7 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -125,9 +125,10 @@ class X86AsmBackend : public MCAsmBackend { unsigned TargetPrefixMax = 0; MCInst PrevInst; + unsigned PrevInstOpcode = 0; MCBoundaryAlignFragment *PendingBA = nullptr; std::pair PrevInstPosition; - bool CanPadInst = false; + bool IsRightAfterData = false; uint8_t determinePaddingPrefix(const MCInst &Inst) const; bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; @@ -267,8 +268,8 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) { } /// Check if the instruction is a prefix. 
-static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) { - return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags); +static bool isPrefix(unsigned Opcode, const MCInstrInfo &MCII) { + return X86II::isPrefix(MCII.get(Opcode).TSFlags); } /// Check if the instruction is valid as the first instruction in macro fusion. @@ -382,9 +383,9 @@ bool X86AsmBackend::allowEnhancedRelaxation() const { /// X86 has certain instructions which enable interrupts exactly one /// instruction *after* the instruction which stores to SS. Return true if the -/// given instruction has such an interrupt delay slot. -static bool hasInterruptDelaySlot(const MCInst &Inst) { - switch (Inst.getOpcode()) { +/// given instruction may have such an interrupt delay slot. +static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) { + switch (InstOpcode) { case X86::POPSS16: case X86::POPSS32: case X86::STI: @@ -394,9 +395,9 @@ static bool hasInterruptDelaySlot(const MCInst &Inst) { case X86::MOV32sr: case X86::MOV64sr: case X86::MOV16sm: - if (Inst.getOperand(0).getReg() == X86::SS) - return true; - break; + // In fact, this is only the case if the first operand is SS. However, as + // segment moves occur extremely rarely, this is just a minor pessimization. + return true; } return false; } @@ -450,22 +451,22 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { // TLSCALL). return false; - if (hasInterruptDelaySlot(PrevInst)) + if (mayHaveInterruptDelaySlot(PrevInstOpcode)) // If this instruction follows an interrupt enabling instruction with a one // instruction delay, inserting a nop would change behavior. return false; - if (isPrefix(PrevInst, *MCII)) + if (isPrefix(PrevInstOpcode, *MCII)) // If this instruction follows a prefix, inserting a nop/prefix would change // semantic. return false; - if (isPrefix(Inst, *MCII)) + if (isPrefix(Inst.getOpcode(), *MCII)) // If this instruction is a prefix, inserting a prefix would change // semantic. 
return false; - if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) + if (IsRightAfterData) // If this instruction follows any data, there is no clear // instruction boundary, inserting a nop/prefix would change semantic. return false; @@ -509,16 +510,24 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const { /// Insert BoundaryAlignFragment before instructions to align branches. void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst, const MCSubtargetInfo &STI) { - CanPadInst = canPadInst(Inst, OS); + // Used by canPadInst. Done here, because in emitInstructionEnd, the current + // fragment will have changed. + IsRightAfterData = + isRightAfterData(OS.getCurrentFragment(), PrevInstPosition); if (!canPadBranches(OS)) return; + // NB: PrevInst only valid if canPadBranches is true. if (!isMacroFused(PrevInst, Inst)) // Macro fusion doesn't happen indeed, clear the pending. PendingBA = nullptr; - if (!CanPadInst) + // When branch padding is enabled (basically the skx102 erratum => unlikely), + // we call canPadInst (not cheap) twice. However, in the common case, we can + // avoid unnecessary calls to that, as this is otherwise only used for + // relaxable fragments. + if (!canPadInst(Inst, OS)) return; if (PendingBA && PendingBA->getNextNode() == OS.getCurrentFragment()) { @@ -552,16 +561,22 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, } /// Set the last fragment to be aligned for the BoundaryAlignFragment. -void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) { - PrevInst = Inst; +void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, + const MCInst &Inst) { MCFragment *CF = OS.getCurrentFragment(); - PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); if (auto *F = dyn_cast_or_null(CF)) - F->setAllowAutoPadding(CanPadInst); + F->setAllowAutoPadding(canPadInst(Inst, OS)); + + // Update PrevInstOpcode here, canPadInst() reads that. 
+ PrevInstOpcode = Inst.getOpcode(); + PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); if (!canPadBranches(OS)) return; + // PrevInst is only needed if canPadBranches. Copying an MCInst isn't cheap. + PrevInst = Inst; + if (!needAlign(Inst) || !PendingBA) return; From 2dc2290860355dd2bac3b655eea895fe30fde257 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 11 Jun 2024 12:15:09 +0100 Subject: [PATCH 46/82] Revert new debug info format commits: "[Flang] Update test to not check for tail calls on debug intrinsics" & "Reapply#3 "[RemoveDIs] Load into new debug info format by default in LLVM (#89799)" Recent updates to flang have added debug info generation via MLIR, a path which currently does not support debug records. The patch that enables debug records by default (and a small followup patch) are thus being reverted until the MLIR path has been fixed. This reverts commits: 21396be865b4640abf6afa0b05de6708a1a996e0 c5aeca732d1ff6769b0659efebd1cfb5f60487e4 --- clang/test/CodeGen/instrument-objc-method.m | 8 +- flang/test/Transforms/debug-local-var-2.f90 | 16 +- llvm/docs/ReleaseNotes.rst | 7 - llvm/include/llvm/AsmParser/LLParser.h | 1 + llvm/lib/AsmParser/LLParser.cpp | 34 +-- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/IR/BasicBlock.cpp | 2 +- llvm/lib/IR/DebugProgramInstruction.cpp | 4 +- llvm/lib/IR/Function.cpp | 4 +- llvm/lib/IR/Module.cpp | 4 +- llvm/tools/llvm-as/llvm-as.cpp | 7 +- llvm/tools/llvm-dis/llvm-dis.cpp | 2 +- llvm/tools/llvm-link/llvm-link.cpp | 8 +- .../Analysis/IRSimilarityIdentifierTest.cpp | 22 +- llvm/unittests/IR/BasicBlockDbgInfoTest.cpp | 68 ++++++ llvm/unittests/IR/DebugInfoTest.cpp | 73 +++--- llvm/unittests/IR/IRBuilderTest.cpp | 12 +- llvm/unittests/IR/InstructionsTest.cpp | 6 - llvm/unittests/IR/ValueTest.cpp | 9 +- .../Transforms/Utils/CloningTest.cpp | 5 +- llvm/unittests/Transforms/Utils/LocalTest.cpp | 211 ++++++++---------- 21 files changed, 260 insertions(+), 245 deletions(-) diff --git 
a/clang/test/CodeGen/instrument-objc-method.m b/clang/test/CodeGen/instrument-objc-method.m index 2c9d1fc88554bd..cfc0a0a98bec6b 100644 --- a/clang/test/CodeGen/instrument-objc-method.m +++ b/clang/test/CodeGen/instrument-objc-method.m @@ -11,16 +11,16 @@ @implementation ObjCClass + (void)initialize { } -// BARE: @"\01+[ObjCClass load]"{{\(.*\)}} #1 +// PREINLINE: declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 +// BARE: @"\01+[ObjCClass load]"{{\(.*\)}} #2 + (void)load __attribute__((no_instrument_function)) { } -// PREINLINE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #1 -// BARE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #1 +// PREINLINE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #2 +// BARE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #2 - (void)dealloc __attribute__((no_instrument_function)) { } -// PREINLINE: declare void @llvm.dbg.declare(metadata, metadata, metadata) #2 // PREINLINE: attributes #0 = { {{.*}}"instrument-function-entry"="__cyg_profile_func_enter" // PREINLINE-NOT: attributes #0 = { {{.*}}"instrument-function-entry"="__cyg_profile_func_enter_bare" // PREINLINE-NOT: attributes #2 = { {{.*}}"__cyg_profile_func_enter" diff --git a/flang/test/Transforms/debug-local-var-2.f90 b/flang/test/Transforms/debug-local-var-2.f90 index ee60a07cc4bee6..0fe1b81c27e61e 100644 --- a/flang/test/Transforms/debug-local-var-2.f90 +++ b/flang/test/Transforms/debug-local-var-2.f90 @@ -20,20 +20,20 @@ ! CHECK-LABEL: define {{.*}}i64 @_QFPfn1 ! CHECK-SAME: (ptr %[[ARG1:.*]], ptr %[[ARG2:.*]], ptr %[[ARG3:.*]]) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression()) +! 
CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression()) ! CHECK-DAG: %[[AL2:.*]] = alloca i64 -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression()) ! CHECK-LABEL: } ! CHECK-LABEL: define {{.*}}i32 @_QFPfn2 ! CHECK-SAME: (ptr %[[FN2ARG1:.*]], ptr %[[FN2ARG2:.*]], ptr %[[FN2ARG3:.*]]) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression()) ! CHECK-DAG: %[[AL3:.*]] = alloca i32 -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression()) ! 
CHECK-LABEL: } program mn diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index b46994bbcd66de..00e2969ee3543b 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -226,13 +226,6 @@ Changes to the Metadata Info Changes to the Debug Info --------------------------------- -* LLVM has switched from using debug intrinsics internally to using debug - records by default. This should happen transparently when using the DIBuilder - to construct debug variable information, but will require changes for any code - that interacts with debug intrinsics directly. Debug intrinsics will only be - supported on a best-effort basis from here onwards; for more information, see - the `migration docs `_. - Changes to the LLVM tools --------------------------------- * llvm-nm and llvm-objdump can now print symbol information from linked diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index e687254f6c4c70..b2dcdfad0a04b4 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -337,6 +337,7 @@ namespace llvm { // Top-Level Entities bool parseTopLevelEntities(); + bool finalizeDebugInfoFormat(Module *M); void dropUnknownMetadataReferences(); bool validateEndOfModule(bool UpgradeDebugInfo); bool validateEndOfIndex(); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index eb1e3e494a42f4..f0fde9ae4df5c3 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -74,6 +74,23 @@ static std::string getTypeString(Type *T) { return Tmp.str(); } +// Whatever debug info format we parsed, we should convert to the expected debug +// info format immediately afterwards. +bool LLParser::finalizeDebugInfoFormat(Module *M) { + // We should have already returned an error if we observed both intrinsics and + // records in this IR. 
+ assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) && + "Mixed debug intrinsics/records seen without a parsing error?"); + if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) { + UseNewDbgInfoFormat = SeenNewDbgInfoFormat; + WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat; + WriteNewDbgInfoFormat = SeenNewDbgInfoFormat; + } else if (M) { + M->setIsNewDbgInfoFormat(false); + } + return false; +} + /// Run: module ::= toplevelentity* bool LLParser::Run(bool UpgradeDebugInfo, DataLayoutCallbackTy DataLayoutCallback) { @@ -91,7 +108,7 @@ bool LLParser::Run(bool UpgradeDebugInfo, } return parseTopLevelEntities() || validateEndOfModule(UpgradeDebugInfo) || - validateEndOfIndex(); + validateEndOfIndex() || finalizeDebugInfoFormat(M); } bool LLParser::parseStandaloneConstantValue(Constant *&C, @@ -190,18 +207,6 @@ void LLParser::dropUnknownMetadataReferences() { bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { if (!M) return false; - - // We should have already returned an error if we observed both intrinsics and - // records in this IR. - assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) && - "Mixed debug intrinsics/records seen without a parsing error?"); - if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) { - UseNewDbgInfoFormat = SeenNewDbgInfoFormat; - WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat; - WriteNewDbgInfoFormat = SeenNewDbgInfoFormat; - M->setNewDbgInfoFormatFlag(SeenNewDbgInfoFormat); - } - // Handle any function attribute group forward references. for (const auto &RAG : ForwardRefAttrGroups) { Value *V = RAG.first; @@ -434,9 +439,6 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { UpgradeModuleFlags(*M); UpgradeSectionAttributes(*M); - if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE) - M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat); - if (!Slots) return false; // Initialize the slot mapping. 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 40852a6fd404b5..c667913005cd80 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -4357,7 +4357,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit, if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE) { TheModule->IsNewDbgInfoFormat = UseNewDbgInfoFormat && - LoadBitcodeIntoNewDbgInfoFormat != cl::boolOrDefault::BOU_FALSE; + LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_TRUE; } this->ValueTypeCallback = std::move(Callbacks.ValueType); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index aea9425ebebaab..29f2cbf611fa3a 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -181,7 +181,7 @@ template class llvm::SymbolTableListTraits(getRawLocation())) || - (getNumVariableLocationOps() == 0 && !getExpression()->isComplex()) || + return (getNumVariableLocationOps() == 0 && + !getExpression()->isComplex()) || any_of(location_ops(), [](Value *V) { return isa(V); }); } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 9360e6d7d274c8..3f735020e87402 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -83,8 +83,6 @@ static cl::opt NonGlobalValueMaxNameSize( "non-global-value-max-name-size", cl::Hidden, cl::init(1024), cl::desc("Maximum size for the name of non-global values.")); -extern cl::opt UseNewDbgInfoFormat; - void Function::convertToNewDbgValues() { IsNewDbgInfoFormat = true; for (auto &BB : *this) { @@ -443,7 +441,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, : GlobalObject(Ty, Value::FunctionVal, OperandTraits::op_begin(this), 0, Linkage, name, computeAddrSpace(AddrSpace, ParentModule)), - NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) { + NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(false) { 
assert(FunctionType::isValidReturnType(getReturnType()) && "invalid return type"); setGlobalObjectSubClassData(0); diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 55c282cb25e793..f97dd18c736c51 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -54,8 +54,6 @@ using namespace llvm; -extern cl::opt UseNewDbgInfoFormat; - //===----------------------------------------------------------------------===// // Methods to implement the globals and functions lists. // @@ -74,7 +72,7 @@ template class llvm::SymbolTableListTraits; Module::Module(StringRef MID, LLVMContext &C) : Context(C), ValSymTab(std::make_unique(-1)), ModuleID(std::string(MID)), SourceFileName(std::string(MID)), DL(""), - IsNewDbgInfoFormat(UseNewDbgInfoFormat) { + IsNewDbgInfoFormat(false) { Context.addModule(this); } diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp index 0958e16c2197ac..e48e3f4d22c123 100644 --- a/llvm/tools/llvm-as/llvm-as.cpp +++ b/llvm/tools/llvm-as/llvm-as.cpp @@ -142,10 +142,11 @@ int main(int argc, char **argv) { } // Convert to new debug format if requested. - M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat && - WriteNewDbgInfoFormatToBitcode); - if (M->IsNewDbgInfoFormat) + assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug mode"); + if (UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode) { + M->convertToNewDbgValues(); M->removeDebugIntrinsicDeclarations(); + } std::unique_ptr Index = std::move(ModuleAndIndex.Index); diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index d28af85bc739eb..fbbb5506e43e05 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -258,7 +258,7 @@ int main(int argc, char **argv) { // All that llvm-dis does is write the assembly to a file. 
if (!DontPrint) { if (M) { - M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat); + ScopedDbgInfoFormatSetter FormatSetter(*M, WriteNewDbgInfoFormat); if (WriteNewDbgInfoFormat) M->removeDebugIntrinsicDeclarations(); M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder); diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index b84469d1c757f8..7794f2d81ed064 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -489,6 +489,12 @@ int main(int argc, char **argv) { if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET) LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE; + // RemoveDIs debug-info transition: tests may request that we /try/ to use the + // new debug-info format. + if (TryUseNewDbgInfoFormat) { + // Turn the new debug-info format on. + UseNewDbgInfoFormat = true; + } // Since llvm-link collects multiple IR modules together, for simplicity's // sake we disable the "PreserveInputDbgFormat" flag to enforce a single // debug info format. 
@@ -550,7 +556,7 @@ int main(int argc, char **argv) { SetFormat(WriteNewDbgInfoFormat); Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder); } else if (Force || !CheckBitcodeOutputToConsole(Out.os())) { - SetFormat(UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode); + SetFormat(WriteNewDbgInfoFormatToBitcode); WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder); } diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp index 24f4f11db9a8b9..f6a053792f8529 100644 --- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp +++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/IRSimilarityIdentifier.h" -#include "llvm/ADT/ScopeExit.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -23,11 +22,6 @@ using namespace llvm; using namespace IRSimilarity; -extern llvm::cl::opt UseNewDbgInfoFormat; -extern cl::opt PreserveInputDbgFormat; -extern bool WriteNewDbgInfoFormatToBitcode; -extern cl::opt WriteNewDbgInfoFormat; - static std::unique_ptr makeLLVMModule(LLVMContext &Context, StringRef ModuleStr) { SMDiagnostic Err; @@ -1312,18 +1306,19 @@ TEST(IRInstructionMapper, CallBrInstIllegal) { ASSERT_GT(UnsignedVec[0], Mapper.IllegalInstrNumber); } -// Checks that an debuginfo records are mapped to be invisible. Since they +// Checks that an debuginfo intrinsics are mapped to be invisible. Since they // do not semantically change the program, they can be recognized as similar. 
TEST(IRInstructionMapper, DebugInfoInvisible) { StringRef ModuleString = R"( define i32 @f(i32 %a, i32 %b) { then: - %0 = add i32 %a, %b - #dbg_value(i32 0, !0, !0, !0) - %1 = add i32 %a, %b + %0 = add i32 %a, %b + call void @llvm.dbg.value(metadata !0) + %1 = add i32 %a, %b ret i32 0 } + declare void @llvm.dbg.value(metadata) !0 = distinct !{!"test\00", i32 10})"; LLVMContext Context; std::unique_ptr M = makeLLVMModule(Context, ModuleString); @@ -1919,19 +1914,19 @@ TEST(IRSimilarityCandidate, CheckRegionsDifferentTypes) { ASSERT_FALSE(longSimCandCompare(InstrList)); } -// Check that debug records do not impact similarity. They are marked as +// Check that debug instructions do not impact similarity. They are marked as // invisible. TEST(IRSimilarityCandidate, IdenticalWithDebug) { StringRef ModuleString = R"( define i32 @f(i32 %a, i32 %b) { bb0: %0 = add i32 %a, %b - #dbg_value(i32 0, !0, !0, !0) + call void @llvm.dbg.value(metadata !0) %1 = add i32 %b, %a ret i32 0 bb1: %2 = add i32 %a, %b - #dbg_value(i32 1, !1, !1, !1) + call void @llvm.dbg.value(metadata !1) %3 = add i32 %b, %a ret i32 0 bb2: @@ -1940,6 +1935,7 @@ TEST(IRSimilarityCandidate, IdenticalWithDebug) { ret i32 0 } + declare void @llvm.dbg.value(metadata) !0 = distinct !{!"test\00", i32 10} !1 = distinct !{!"test\00", i32 11})"; LLVMContext Context; diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp index 91a0745a0cc76e..f873bbd4293af5 100644 --- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp +++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp @@ -25,6 +25,8 @@ using namespace llvm; +extern cl::opt UseNewDbgInfoFormat; + static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { SMDiagnostic Err; std::unique_ptr Mod = parseAssemblyString(IR, Err, C); @@ -42,6 +44,8 @@ namespace { // by DbgVariableRecords, the dbg.value replacement. 
TEST(BasicBlockDbgInfoTest, InsertAfterSelf) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11 @@ -68,6 +72,8 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) { !11 = !DILocation(line: 1, column: 1, scope: !6) )"); + // Convert the module to "new" form debug-info. + M->convertToNewDbgValues(); // Fetch the entry block. BasicBlock &BB = M->getFunction("f")->getEntryBlock(); @@ -97,10 +103,16 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) { EXPECT_TRUE(RetInst->hasDbgRecords()); auto Range2 = RetInst->getDbgRecordRange(); EXPECT_EQ(std::distance(Range2.begin(), Range2.end()), 1u); + + M->convertFromNewDbgValues(); + + UseNewDbgInfoFormat = false; } TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"---( define dso_local void @func() #0 !dbg !10 { %1 = alloca i32, align 4 @@ -138,6 +150,8 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) { )---"); ASSERT_TRUE(M); + M->convertToNewDbgValues(); + Function *F = M->getFunction("func"); BasicBlock &BB = F->getEntryBlock(); @@ -147,10 +161,14 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) { BasicBlock &BBBefore = F->getEntryBlock(); auto I2 = std::prev(BBBefore.end(), 2); ASSERT_TRUE(I2->hasDbgRecords()); + + UseNewDbgInfoFormat = false; } TEST(BasicBlockDbgInfoTest, MarkerOperations) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11 @@ -178,6 +196,8 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) { // Fetch the entry block, BasicBlock &BB = M->getFunction("f")->getEntryBlock(); + // Convert the module to "new" form debug-info. 
+ M->convertToNewDbgValues(); EXPECT_EQ(BB.size(), 2u); // Fetch out our two markers, @@ -275,10 +295,14 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) { // Teardown, Instr1->insertBefore(BB, BB.begin()); + + UseNewDbgInfoFormat = false; } TEST(BasicBlockDbgInfoTest, HeadBitOperations) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { %b = add i16 %a, 1, !dbg !11 @@ -308,6 +332,8 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) { // Test that the movement of debug-data when using moveBefore etc and // insertBefore etc are governed by the "head" bit of iterators. BasicBlock &BB = M->getFunction("f")->getEntryBlock(); + // Convert the module to "new" form debug-info. + M->convertToNewDbgValues(); // Test that the head bit behaves as expected: it should be set when the // code wants the _start_ of the block, but not otherwise. @@ -378,10 +404,14 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) { DInst->DebugMarker->StoredDbgRecords.empty()); EXPECT_FALSE(CInst->DebugMarker->StoredDbgRecords.empty()); EXPECT_EQ(&*BB.begin(), CInst); + + UseNewDbgInfoFormat = false; } TEST(BasicBlockDbgInfoTest, InstrDbgAccess) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { %b = add i16 %a, 1, !dbg !11 @@ -411,6 +441,8 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) { // Check that DbgVariableRecords can be accessed from Instructions without // digging into the depths of DbgMarkers. BasicBlock &BB = M->getFunction("f")->getEntryBlock(); + // Convert the module to "new" form debug-info. 
+ M->convertToNewDbgValues(); Instruction *BInst = &*BB.begin(); Instruction *CInst = BInst->getNextNode(); @@ -451,6 +483,8 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) { CInst->dropOneDbgRecord(DVR1); EXPECT_FALSE(CInst->hasDbgRecords()); EXPECT_EQ(CInst->DebugMarker->StoredDbgRecords.size(), 0u); + + UseNewDbgInfoFormat = false; } /* Let's recall the big illustration from BasicBlock::spliceDebugInfo: @@ -543,7 +577,9 @@ class DbgSpliceTest : public ::testing::Test { DbgVariableRecord *DVRA, *DVRB, *DVRConst; void SetUp() override { + UseNewDbgInfoFormat = true; M = parseIR(C, SpliceTestIR.c_str()); + M->convertToNewDbgValues(); BBEntry = &M->getFunction("f")->getEntryBlock(); BBExit = BBEntry->getNextNode(); @@ -563,6 +599,8 @@ class DbgSpliceTest : public ::testing::Test { cast(&*CInst->DebugMarker->StoredDbgRecords.begin()); } + void TearDown() override { UseNewDbgInfoFormat = false; } + bool InstContainsDbgVariableRecord(Instruction *I, DbgVariableRecord *DVR) { for (DbgRecord &D : I->getDbgRecordRange()) { if (&D == DVR) { @@ -1149,6 +1187,8 @@ metadata !9, metadata !DIExpression()), !dbg !11 Dest %c = add i16 %b, 1, // then the trailing DbgVariableRecords should get flushed back out. TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { entry: @@ -1179,6 +1219,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) { BasicBlock &Entry = M->getFunction("f")->getEntryBlock(); BasicBlock &Exit = *Entry.getNextNode(); + M->convertToNewDbgValues(); // Begin by forcing entry block to have dangling DbgVariableRecord. 
Entry.getTerminator()->eraseFromParent(); @@ -1193,6 +1234,8 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) { Instruction *BInst = &*Entry.begin(); ASSERT_TRUE(BInst->DebugMarker); EXPECT_EQ(BInst->DebugMarker->StoredDbgRecords.size(), 1u); + + UseNewDbgInfoFormat = false; } // When we remove instructions from the program, adjacent DbgVariableRecords @@ -1201,6 +1244,8 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) { // dbg.values. Test that this can be replicated correctly by DbgVariableRecords TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { entry: @@ -1228,6 +1273,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) { )"); BasicBlock &Entry = M->getFunction("f")->getEntryBlock(); + M->convertToNewDbgValues(); // Fetch the relevant instructions from the converted function. Instruction *SubInst = &*Entry.begin(); @@ -1270,12 +1316,16 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) { EXPECT_EQ(std::distance(R4.begin(), R4.end()), 1u); auto R5 = RetInst->getDbgRecordRange(); EXPECT_EQ(std::distance(R5.begin(), R5.end()), 1u); + + UseNewDbgInfoFormat = false; } // Test instruction removal and re-insertion, this time with one // DbgVariableRecord that should hop up one instruction. TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { entry: @@ -1302,6 +1352,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) { )"); BasicBlock &Entry = M->getFunction("f")->getEntryBlock(); + M->convertToNewDbgValues(); // Fetch the relevant instructions from the converted function. 
Instruction *SubInst = &*Entry.begin(); @@ -1340,6 +1391,8 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) { EXPECT_FALSE(RetInst->hasDbgRecords()); auto R3 = AddInst->getDbgRecordRange(); EXPECT_EQ(std::distance(R3.begin(), R3.end()), 1u); + + UseNewDbgInfoFormat = false; } // Similar to the above, what if we splice into an empty block with debug-info, @@ -1348,6 +1401,8 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) { // of the i16 0 dbg.value. TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { entry: @@ -1381,6 +1436,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) { Function &F = *M->getFunction("f"); BasicBlock &Entry = F.getEntryBlock(); BasicBlock &Exit = *Entry.getNextNode(); + M->convertToNewDbgValues(); // Begin by forcing entry block to have dangling DbgVariableRecord. Entry.getTerminator()->eraseFromParent(); @@ -1407,12 +1463,16 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) { // No trailing DbgVariableRecords in the entry block now. EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr); + + UseNewDbgInfoFormat = false; } // Similar test again, but this time: splice the contents of exit into entry, // with the intention of leaving the first dbg.value (i16 0) behind. TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { entry: @@ -1446,6 +1506,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) { Function &F = *M->getFunction("f"); BasicBlock &Entry = F.getEntryBlock(); BasicBlock &Exit = *Entry.getNextNode(); + M->convertToNewDbgValues(); // Begin by forcing entry block to have dangling DbgVariableRecord. 
Entry.getTerminator()->eraseFromParent(); @@ -1476,12 +1537,16 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) { EXPECT_FALSE(Exit.getTrailingDbgRecords()->empty()); Exit.getTrailingDbgRecords()->eraseFromParent(); Exit.deleteTrailingDbgRecords(); + + UseNewDbgInfoFormat = false; } // What if we moveBefore end() -- there might be no debug-info there, in which // case we shouldn't crash. TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) { LLVMContext C; + UseNewDbgInfoFormat = true; + std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { entry: @@ -1511,6 +1576,7 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) { Function &F = *M->getFunction("f"); BasicBlock &Entry = F.getEntryBlock(); BasicBlock &Exit = *Entry.getNextNode(); + M->convertToNewDbgValues(); // Move the return to the end of the entry block. Instruction *Br = Entry.getTerminator(); @@ -1523,6 +1589,8 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) { EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr); EXPECT_EQ(Exit.getTrailingDbgRecords(), nullptr); EXPECT_FALSE(Ret->hasDbgRecords()); + + UseNewDbgInfoFormat = false; } } // End anonymous namespace. diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index cac8acbe15a79d..ec3f33318f8cdb 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -156,7 +156,7 @@ TEST(StripTest, LoopMetadata) { EXPECT_FALSE(BrokenDebugInfo); } -TEST(MetadataTest, DeleteInstUsedByDbgRecord) { +TEST(MetadataTest, DeleteInstUsedByDbgValue) { LLVMContext C; std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { @@ -187,13 +187,12 @@ TEST(MetadataTest, DeleteInstUsedByDbgRecord) { // Find the dbg.value using %b. SmallVector DVIs; - SmallVector DVRs; - findDbgValues(DVIs, &I, &DVRs); + findDbgValues(DVIs, &I); // Delete %b. The dbg.value should now point to undef. 
I.eraseFromParent(); - EXPECT_EQ(DVRs[0]->getNumVariableLocationOps(), 1u); - EXPECT_TRUE(isa(DVRs[0]->getValue(0))); + EXPECT_EQ(DVIs[0]->getNumVariableLocationOps(), 1u); + EXPECT_TRUE(isa(DVIs[0]->getValue(0))); } TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) { @@ -231,8 +230,8 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) { // Get the dbg.declare. Function &F = *cast(M->getNamedValue("fun")); - DbgVariableRecord *DbgDeclare = - cast(&*F.front().front().getDbgRecordRange().begin()); + DbgVariableIntrinsic *DbgDeclare = + cast(&F.front().front()); // Check that this form counts as a "no location" marker. EXPECT_TRUE(DbgDeclare->isKillLocation()); } @@ -240,9 +239,6 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) { // Duplicate of above test, but in DbgVariableRecord representation. TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) { LLVMContext C; - bool OldDbgValueMode = UseNewDbgInfoFormat; - UseNewDbgInfoFormat = true; - std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { %b = add i16 %a, 1, !dbg !11 @@ -268,7 +264,10 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) { !11 = !DILocation(line: 1, column: 1, scope: !6) )"); + bool OldDbgValueMode = UseNewDbgInfoFormat; + UseNewDbgInfoFormat = true; Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + M->convertToNewDbgValues(); // Find the DbgVariableRecords using %b. SmallVector DVIs; @@ -290,8 +289,6 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) { // Ensure that the order of dbg.value intrinsics returned by findDbgValues, and // their corresponding DbgVariableRecord representation, are consistent. 
TEST(MetadataTest, OrderingOfDbgVariableRecords) { - bool OldDbgValueMode = UseNewDbgInfoFormat; - UseNewDbgInfoFormat = false; LLVMContext C; std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { @@ -319,6 +316,8 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) { !12 = !DILocalVariable(name: "bar", scope: !6, file: !1, line: 1, type: !10) )"); + bool OldDbgValueMode = UseNewDbgInfoFormat; + UseNewDbgInfoFormat = true; Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); SmallVector DVIs; @@ -516,15 +515,14 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) { Value *V1 = Fun.getArg(0); Value *P1 = Fun.getArg(1); Value *P2 = Fun.getArg(2); - DbgVariableRecord *DbgAssign = cast( - &*Fun.front().front().getDbgRecordRange().begin()); - ASSERT_TRUE(V1 == DbgAssign->getVariableLocationOp(0)); - ASSERT_TRUE(P1 == DbgAssign->getAddress()); + DbgAssignIntrinsic *DAI = cast(Fun.begin()->begin()); + ASSERT_TRUE(V1 == DAI->getVariableLocationOp(0)); + ASSERT_TRUE(P1 == DAI->getAddress()); #define TEST_REPLACE(Old, New, ExpectedValue, ExpectedAddr) \ - DbgAssign->replaceVariableLocationOp(Old, New); \ - EXPECT_EQ(DbgAssign->getVariableLocationOp(0), ExpectedValue); \ - EXPECT_EQ(DbgAssign->getAddress(), ExpectedAddr); + DAI->replaceVariableLocationOp(Old, New); \ + EXPECT_EQ(DAI->getVariableLocationOp(0), ExpectedValue); \ + EXPECT_EQ(DAI->getAddress(), ExpectedAddr); // Replace address only. TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2); @@ -535,8 +533,8 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) { // Replace address only, value uses a DIArgList. // Value = {DIArgList(V1)}, Addr = P1. 
- DbgAssign->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1))); - DbgAssign->setExpression(DIExpression::get( + DAI->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1))); + DAI->setExpression(DIExpression::get( C, {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_stack_value})); TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2); #undef TEST_REPLACE @@ -622,11 +620,11 @@ TEST(AssignmentTrackingTest, Utils) { // // Check there are two llvm.dbg.assign intrinsics linked to Alloca. auto CheckFun1Mapping = [&Alloca]() { - auto Markers = at::getDVRAssignmentMarkers(&Alloca); + auto Markers = at::getAssignmentMarkers(&Alloca); EXPECT_TRUE(std::distance(Markers.begin(), Markers.end()) == 2); // Check those two entries are distinct. - DbgVariableRecord *First = *Markers.begin(); - DbgVariableRecord *Second = *std::next(Markers.begin()); + DbgAssignIntrinsic *First = *Markers.begin(); + DbgAssignIntrinsic *Second = *std::next(Markers.begin()); EXPECT_NE(First, Second); // Check that we can get back to Alloca from each llvm.dbg.assign. @@ -662,7 +660,7 @@ TEST(AssignmentTrackingTest, Utils) { DIAssignID *Fun2ID = cast_or_null( Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID)); EXPECT_NE(New, Fun2ID); - auto Fun2Markers = at::getDVRAssignmentMarkers(&Fun2Alloca); + auto Fun2Markers = at::getAssignmentMarkers(&Fun2Alloca); ASSERT_TRUE(std::distance(Fun2Markers.begin(), Fun2Markers.end()) == 1); auto Fun2Insts = at::getAssignmentInsts(*Fun2Markers.begin()); ASSERT_TRUE(std::distance(Fun2Insts.begin(), Fun2Insts.end()) == 1); @@ -671,10 +669,10 @@ TEST(AssignmentTrackingTest, Utils) { // 3. Check that deleting dbg.assigns from a specific instruction works. 
Instruction &Fun3Alloca = *M->getFunction("fun3")->getEntryBlock().getFirstNonPHIOrDbg(); - auto Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca); + auto Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca); ASSERT_TRUE(std::distance(Fun3Markers.begin(), Fun3Markers.end()) == 1); at::deleteAssignmentMarkers(&Fun3Alloca); - Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca); + Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca); EXPECT_EQ(Fun3Markers.empty(), true); // 4. Check that deleting works and applies only to the target function. @@ -685,7 +683,7 @@ TEST(AssignmentTrackingTest, Utils) { // llvm.dbg.assign. EXPECT_EQ(Fun2ID, cast_or_null( Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID))); - EXPECT_FALSE(at::getDVRAssignmentMarkers(&Fun2Alloca).empty()); + EXPECT_FALSE(at::getAssignmentMarkers(&Fun2Alloca).empty()); } TEST(IRBuilder, GetSetInsertionPointWithEmptyBasicBlock) { @@ -771,12 +769,12 @@ TEST(AssignmentTrackingTest, InstrMethods) { // Use SetVectors to check that the attachments and markers are unique // (another test requirement). SetVector OrigIDs; - SetVector Markers; + SetVector Markers; for (const Instruction *SI : Stores) { Metadata *ID = SI->getMetadata(LLVMContext::MD_DIAssignID); ASSERT_TRUE(OrigIDs.insert(ID)); ASSERT_TRUE(ID != nullptr); - auto Range = at::getDVRAssignmentMarkers(SI); + auto Range = at::getAssignmentMarkers(SI); ASSERT_TRUE(std::distance(Range.begin(), Range.end()) == 1); ASSERT_TRUE(Markers.insert(*Range.begin())); } @@ -869,8 +867,6 @@ TEST(AssignmentTrackingTest, InstrMethods) { // dbg.values that have been converted to a non-instruction format. 
TEST(MetadataTest, ConvertDbgToDbgVariableRecord) { LLVMContext C; - bool OldDbgValueMode = UseNewDbgInfoFormat; - UseNewDbgInfoFormat = false; std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11 @@ -1045,14 +1041,14 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) { // The record of those trailing DbgVariableRecords would dangle and cause an // assertion failure if it lived until the end of the LLVMContext. ExitBlock->deleteTrailingDbgRecords(); - UseNewDbgInfoFormat = OldDbgValueMode; } TEST(MetadataTest, DbgVariableRecordConversionRoutines) { LLVMContext C; - bool OldDbgValueMode = UseNewDbgInfoFormat; - UseNewDbgInfoFormat = false; + // For the purpose of this test, set and un-set the command line option + // corresponding to UseNewDbgInfoFormat. + UseNewDbgInfoFormat = true; std::unique_ptr M = parseIR(C, R"( define i16 @f(i16 %a) !dbg !6 { @@ -1083,11 +1079,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) { !11 = !DILocation(line: 1, column: 1, scope: !6) )"); - // For the purpose of this test, set and un-set the command line option - // corresponding to UseNewDbgInfoFormat, but only after parsing, to ensure - // that the IR starts off in the old format. - UseNewDbgInfoFormat = true; - // Check that the conversion routines and utilities between dbg.value // debug-info format and DbgVariableRecords works. 
Function *F = M->getFunction("f"); @@ -1192,7 +1183,7 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) { EXPECT_EQ(DVI2->getVariable(), DLV2); EXPECT_EQ(DVI2->getExpression(), Expr2); - UseNewDbgInfoFormat = OldDbgValueMode; + UseNewDbgInfoFormat = false; } // Test that the hashing function for DISubprograms representing methods produce diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index ff96df85812002..2001df090aed53 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -994,17 +994,17 @@ TEST_F(IRBuilderTest, DIBuilder) { EXPECT_TRUE(verifyModule(*M)); }; - // Test in new-debug mode. - EXPECT_TRUE(M->IsNewDbgInfoFormat); + // Test in old-debug mode. + EXPECT_FALSE(M->IsNewDbgInfoFormat); RunTest(); - // Test in old-debug mode. - // Reset the test then call convertFromNewDbgValues to flip the flag + // Test in new-debug mode. + // Reset the test then call convertToNewDbgValues to flip the flag // on the test's Module, Function and BasicBlock. 
TearDown(); SetUp(); - M->convertFromNewDbgValues(); - EXPECT_FALSE(M->IsNewDbgInfoFormat); + M->convertToNewDbgValues(); + EXPECT_TRUE(M->IsNewDbgInfoFormat); RunTest(); } diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp index b6044b28629204..b47c73f0b329ae 100644 --- a/llvm/unittests/IR/InstructionsTest.cpp +++ b/llvm/unittests/IR/InstructionsTest.cpp @@ -25,15 +25,12 @@ #include "llvm/IR/Module.h" #include "llvm/IR/NoFolder.h" #include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/SourceMgr.h" #include "llvm-c/Core.h" #include "gmock/gmock-matchers.h" #include "gtest/gtest.h" #include -extern llvm::cl::opt UseNewDbgInfoFormat; - namespace llvm { namespace { @@ -1463,8 +1460,6 @@ TEST(InstructionsTest, GetSplat) { TEST(InstructionsTest, SkipDebug) { LLVMContext C; - bool OldDbgValueMode = UseNewDbgInfoFormat; - UseNewDbgInfoFormat = false; std::unique_ptr M = parseIR(C, R"( declare void @llvm.dbg.value(metadata, metadata, metadata) @@ -1500,7 +1495,6 @@ TEST(InstructionsTest, SkipDebug) { // After the terminator, there are no non-debug instructions. EXPECT_EQ(nullptr, Term->getNextNonDebugInstruction()); - UseNewDbgInfoFormat = OldDbgValueMode; } TEST(InstructionsTest, PhiMightNotBeFPMathOperator) { diff --git a/llvm/unittests/IR/ValueTest.cpp b/llvm/unittests/IR/ValueTest.cpp index 33a86d510d45cb..246c2fc7fe4063 100644 --- a/llvm/unittests/IR/ValueTest.cpp +++ b/llvm/unittests/IR/ValueTest.cpp @@ -13,7 +13,6 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSlotTracker.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" using namespace llvm; @@ -256,8 +255,6 @@ TEST(ValueTest, getLocalSlotDeath) { TEST(ValueTest, replaceUsesOutsideBlock) { // Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside // BB, including dbg.* uses of MetadataAsValue(ValueAsMetadata(this)). 
- bool OldDbgValueMode = UseNewDbgInfoFormat; - UseNewDbgInfoFormat = false; const auto *IR = R"( define i32 @f() !dbg !6 { entry: @@ -318,7 +315,6 @@ TEST(ValueTest, replaceUsesOutsideBlock) { // These users are outside Entry so should be changed. ASSERT_TRUE(ExitDbg->getValue(0) == cast(B)); ASSERT_TRUE(Ret->getOperand(0) == cast(B)); - UseNewDbgInfoFormat = OldDbgValueMode; } TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) { @@ -363,6 +359,10 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) { if (!M) Err.print("ValueTest", errs()); + bool OldDbgValueMode = UseNewDbgInfoFormat; + UseNewDbgInfoFormat = true; + M->convertToNewDbgValues(); + auto GetNext = [](auto *I) { return &*++I->getIterator(); }; Function *F = M->getFunction("f"); @@ -389,6 +389,7 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) { EXPECT_TRUE(DVR1->getVariableLocationOp(0) == cast(A)); // These users are outside Entry so should be changed. EXPECT_TRUE(DVR2->getVariableLocationOp(0) == cast(B)); + UseNewDbgInfoFormat = OldDbgValueMode; } } // end anonymous namespace diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp index 1d0d56a2099ceb..5e302d9c0a0d3e 100644 --- a/llvm/unittests/Transforms/Utils/CloningTest.cpp +++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp @@ -844,9 +844,8 @@ TEST(CloneFunction, CloneFunctionWithInlinedSubprograms) { EXPECT_FALSE(verifyModule(*ImplModule, &errs())); // Check that DILexicalBlock of inlined function was not cloned. 
- auto DbgDeclareI = Func->begin()->begin()->getDbgRecordRange().begin(); - auto ClonedDbgDeclareI = - ClonedFunc->begin()->begin()->getDbgRecordRange().begin(); + auto DbgDeclareI = Func->begin()->begin(); + auto ClonedDbgDeclareI = ClonedFunc->begin()->begin(); const DebugLoc &DbgLoc = DbgDeclareI->getDebugLoc(); const DebugLoc &ClonedDbgLoc = ClonedDbgDeclareI->getDebugLoc(); EXPECT_NE(DbgLoc.get(), ClonedDbgLoc.get()); diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 316d59a9d22969..9b1176765c17f1 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/PostDominators.h" @@ -27,27 +26,6 @@ using namespace llvm; -extern llvm::cl::opt UseNewDbgInfoFormat; -extern cl::opt PreserveInputDbgFormat; -extern bool WriteNewDbgInfoFormatToBitcode; -extern cl::opt WriteNewDbgInfoFormat; - -// Backup all of the existing settings that may be modified when -// PreserveInputDbgFormat=true, so that when the test is finished we return them -// (and the "preserve" setting) to their original values. 
-static auto SaveDbgInfoFormat() { - return make_scope_exit( - [OldPreserveInputDbgFormat = PreserveInputDbgFormat.getValue(), - OldUseNewDbgInfoFormat = UseNewDbgInfoFormat.getValue(), - OldWriteNewDbgInfoFormatToBitcode = WriteNewDbgInfoFormatToBitcode, - OldWriteNewDbgInfoFormat = WriteNewDbgInfoFormat.getValue()] { - PreserveInputDbgFormat = OldPreserveInputDbgFormat; - UseNewDbgInfoFormat = OldUseNewDbgInfoFormat; - WriteNewDbgInfoFormatToBitcode = OldWriteNewDbgInfoFormatToBitcode; - WriteNewDbgInfoFormat = OldWriteNewDbgInfoFormat; - }); -} - TEST(Local, RecursivelyDeleteDeadPHINodes) { LLVMContext C; @@ -138,6 +116,7 @@ static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { TEST(Local, ReplaceDbgDeclare) { LLVMContext C; + // Original C source to get debug info for a local variable: // void f() { int x; } std::unique_ptr M = parseIR(C, @@ -145,11 +124,11 @@ TEST(Local, ReplaceDbgDeclare) { define void @f() !dbg !8 { entry: %x = alloca i32, align 4 - #dbg_declare(ptr %x, !11, !DIExpression(), !13) - #dbg_declare(ptr %x, !11, !DIExpression(), !13) + call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13 + call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13 ret void, !dbg !14 } - + declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) @@ -172,18 +151,20 @@ TEST(Local, ReplaceDbgDeclare) { Instruction *Inst = &F->front().front(); auto *AI = dyn_cast(Inst); ASSERT_TRUE(AI); - + Inst = Inst->getNextNode()->getNextNode(); + ASSERT_TRUE(Inst); + auto *DII = dyn_cast(Inst); + ASSERT_TRUE(DII); Value *NewBase = Constant::getNullValue(PointerType::getUnqual(C)); DIBuilder DIB(*M); replaceDbgDeclare(AI, NewBase, DIB, DIExpression::ApplyOffset, 0); - 
// There should be exactly two dbg.declares, attached to the terminator. - Inst = F->front().getTerminator(); - ASSERT_TRUE(Inst); - EXPECT_TRUE(Inst->hasDbgRecords()); - EXPECT_EQ(range_size(Inst->getDbgRecordRange()), 2u); - for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange())) - EXPECT_EQ(DVR.getAddress(), NewBase); + // There should be exactly two dbg.declares. + int Declares = 0; + for (const Instruction &I : F->front()) + if (isa(I)) + Declares++; + EXPECT_EQ(2, Declares); } /// Build the dominator tree for the function and run the Test. @@ -518,10 +499,11 @@ struct SalvageDebugInfoTest : ::testing::Test { entry: %x = add i32 0, 1 %y = add i32 %x, 2 - #dbg_value(i32 %x, !11, !DIExpression(), !13) - #dbg_value(i32 %y, !11, !DIExpression(), !13) + call void @llvm.dbg.value(metadata i32 %x, metadata !11, metadata !DIExpression()), !dbg !13 + call void @llvm.dbg.value(metadata i32 %y, metadata !11, metadata !DIExpression()), !dbg !13 ret void, !dbg !14 } + declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) @@ -544,47 +526,48 @@ struct SalvageDebugInfoTest : ::testing::Test { ASSERT_TRUE(F); } - bool doesDebugValueDescribeX(const DbgVariableRecord &DVR) { - if (DVR.getNumVariableLocationOps() != 1u) + bool doesDebugValueDescribeX(const DbgValueInst &DI) { + if (DI.getNumVariableLocationOps() != 1u) return false; - const auto &CI = *cast(DVR.getValue(0)); + const auto &CI = *cast(DI.getValue(0)); if (CI.isZero()) - return DVR.getExpression()->getElements().equals( + return DI.getExpression()->getElements().equals( {dwarf::DW_OP_plus_uconst, 1, dwarf::DW_OP_stack_value}); else if (CI.isOneValue()) - return DVR.getExpression()->getElements().empty(); + return DI.getExpression()->getElements().empty(); return false; } - bool 
doesDebugValueDescribeY(const DbgVariableRecord &DVR) { - if (DVR.getNumVariableLocationOps() != 1u) + bool doesDebugValueDescribeY(const DbgValueInst &DI) { + if (DI.getNumVariableLocationOps() != 1u) return false; - const auto &CI = *cast(DVR.getVariableLocationOp(0)); + const auto &CI = *cast(DI.getVariableLocationOp(0)); if (CI.isZero()) - return DVR.getExpression()->getElements().equals( + return DI.getExpression()->getElements().equals( {dwarf::DW_OP_plus_uconst, 3, dwarf::DW_OP_stack_value}); else if (CI.isOneValue()) - return DVR.getExpression()->getElements().equals( + return DI.getExpression()->getElements().equals( {dwarf::DW_OP_plus_uconst, 2, dwarf::DW_OP_stack_value}); return false; } void verifyDebugValuesAreSalvaged() { - // The function should only contain debug values and a terminator. - EXPECT_EQ(F->size(), 1u); - EXPECT_TRUE(F->begin()->begin()->isTerminator()); - // Check that the debug values for %x and %y are preserved. bool FoundX = false; bool FoundY = false; - for (DbgVariableRecord &DVR : - filterDbgVars(F->begin()->begin()->getDbgRecordRange())) { - EXPECT_EQ(DVR.getVariable()->getName(), "x"); - FoundX |= doesDebugValueDescribeX(DVR); - FoundY |= doesDebugValueDescribeY(DVR); + for (const Instruction &I : F->front()) { + auto DI = dyn_cast(&I); + if (!DI) { + // The function should only contain debug values and a terminator. + ASSERT_TRUE(I.isTerminator()); + continue; + } + EXPECT_EQ(DI->getVariable()->getName(), "x"); + FoundX |= doesDebugValueDescribeX(*DI); + FoundY |= doesDebugValueDescribeY(*DI); } - EXPECT_TRUE(FoundX); - EXPECT_TRUE(FoundY); + ASSERT_TRUE(FoundX); + ASSERT_TRUE(FoundY); } }; @@ -607,12 +590,6 @@ TEST_F(SalvageDebugInfoTest, RecursiveBlockSimplification) { TEST(Local, wouldInstructionBeTriviallyDead) { LLVMContext Ctx; - // FIXME: PreserveInputDbgFormat is set to true because this test has - // been written to expect debug intrinsics rather than debug records. 
- // TODO: This test doesn't have a DbgRecord equivalent form so delete - // it when debug intrinsics are removed. - auto SettingGuard = SaveDbgInfoFormat(); - PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE; std::unique_ptr M = parseIR(Ctx, R"( define dso_local void @fun() local_unnamed_addr #0 !dbg !9 { @@ -706,10 +683,12 @@ TEST(Local, FindDbgUsers) { R"( define dso_local void @fun(ptr %a) #0 !dbg !11 { entry: - #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19) + call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19 ret void } + declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3, !9} !llvm.ident = !{!10} @@ -736,13 +715,9 @@ TEST(Local, FindDbgUsers) { verifyModule(*M, &errs(), &BrokenDebugInfo); ASSERT_FALSE(BrokenDebugInfo); - // Convert to debug intrinsics as we want to test findDbgUsers and - // findDbgValue's debug-intrinsic-finding code here. - // TODO: Remove this test when debug intrinsics are removed. - M->convertFromNewDbgValues(); - Function &Fun = *cast(M->getNamedValue("fun")); Value *Arg = Fun.getArg(0); + SmallVector Users; // Arg (%a) is used twice by a single dbg.assign. Check findDbgUsers returns // only 1 pointer to it rather than 2. 
@@ -763,7 +738,7 @@ TEST(Local, FindDbgRecords) { R"( define dso_local void @fun(ptr %a) #0 !dbg !11 { entry: - #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19) + call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19 ret void } @@ -792,6 +767,9 @@ TEST(Local, FindDbgRecords) { bool BrokenDebugInfo = true; verifyModule(*M, &errs(), &BrokenDebugInfo); ASSERT_FALSE(BrokenDebugInfo); + bool NewDbgInfoFormat = UseNewDbgInfoFormat; + UseNewDbgInfoFormat = true; + M->convertToNewDbgValues(); Function &Fun = *cast(M->getNamedValue("fun")); Value *Arg = Fun.getArg(0); @@ -811,10 +789,12 @@ TEST(Local, FindDbgRecords) { findDbgValues(Vals, Arg, &Records); EXPECT_EQ(Vals.size(), 0u); EXPECT_EQ(Records.size(), 1u); + UseNewDbgInfoFormat = NewDbgInfoFormat; } TEST(Local, ReplaceAllDbgUsesWith) { using namespace llvm::dwarf; + LLVMContext Ctx; // Note: The datalayout simulates Darwin/x86_64. 
@@ -827,36 +807,39 @@ TEST(Local, ReplaceAllDbgUsesWith) { define void @f() !dbg !6 { entry: %a = add i32 0, 1, !dbg !15 + call void @llvm.dbg.value(metadata i32 %a, metadata !9, metadata !DIExpression()), !dbg !15 - #dbg_value(i32 %a, !9, !DIExpression(), !15) %b = add i64 0, 1, !dbg !16 + call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression()), !dbg !16 + call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul)), !dbg !16 + call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value)), !dbg !16 + call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 8)), !dbg !16 + call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8)), !dbg !16 + call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8)), !dbg !16 - #dbg_value(i64 %b, !11, !DIExpression(), !16) - #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul), !16) - #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value), !16) - #dbg_value(i64 %b, !11, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !16) - #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8), !16) - #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8), !16) - %c = inttoptr i64 0 to ptr, !dbg !17 + %c = inttoptr i64 0 to i64*, !dbg !17 + call void @llvm.dbg.declare(metadata i64* %c, metadata !13, metadata !DIExpression()), !dbg !17 - #dbg_declare(ptr %c, !13, !DIExpression(), !17) - %d = inttoptr i64 0 to ptr, !dbg !18 + %d = inttoptr i64 0 to i32*, !dbg !18 + call void @llvm.dbg.declare(metadata i32* %d, metadata !20, metadata !DIExpression()), !dbg !18 - #dbg_declare(ptr %d, !20, !DIExpression(), 
!18) %e = add <2 x i16> zeroinitializer, zeroinitializer + call void @llvm.dbg.value(metadata <2 x i16> %e, metadata !14, metadata !DIExpression()), !dbg !18 - #dbg_value(<2 x i16> %e, !14, !DIExpression(), !18) %f = call i32 @escape(i32 0) + call void @llvm.dbg.value(metadata i32 %f, metadata !9, metadata !DIExpression()), !dbg !15 - #dbg_value(i32 %f, !9, !DIExpression(), !15) %barrier = call i32 @escape(i32 0) %g = call i32 @escape(i32 %f) + call void @llvm.dbg.value(metadata i32 %g, metadata !9, metadata !DIExpression()), !dbg !15 - #dbg_value(i32 %g, !9, !DIExpression(), !15) ret void, !dbg !19 } + declare void @llvm.dbg.declare(metadata, metadata, metadata) + declare void @llvm.dbg.value(metadata, metadata, metadata) + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!5} @@ -911,47 +894,38 @@ TEST(Local, ReplaceAllDbgUsesWith) { EXPECT_TRUE(replaceAllDbgUsesWith(D, C, C, DT)); SmallVector CDbgVals; - SmallVector CDbgRecords; - findDbgUsers(CDbgVals, &C, &CDbgRecords); - EXPECT_EQ(0U, CDbgVals.size()); - EXPECT_EQ(2U, CDbgRecords.size()); - EXPECT_TRUE(all_of( - CDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); })); + findDbgUsers(CDbgVals, &C); + EXPECT_EQ(2U, CDbgVals.size()); + EXPECT_TRUE(all_of(CDbgVals, [](DbgVariableIntrinsic *DII) { + return isa(DII); + })); EXPECT_TRUE(replaceAllDbgUsesWith(C, D, D, DT)); SmallVector DDbgVals; - SmallVector DDbgRecords; - findDbgUsers(DDbgVals, &D, &DDbgRecords); - EXPECT_EQ(0U, DDbgVals.size()); - EXPECT_EQ(2U, DDbgRecords.size()); - EXPECT_TRUE(all_of( - DDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); })); + findDbgUsers(DDbgVals, &D); + EXPECT_EQ(2U, DDbgVals.size()); + EXPECT_TRUE(all_of(DDbgVals, [](DbgVariableIntrinsic *DII) { + return isa(DII); + })); // Introduce a use-before-def. Check that the dbg.value for %a is salvaged. 
EXPECT_TRUE(replaceAllDbgUsesWith(A, F_, F_, DT)); - EXPECT_FALSE(A.hasDbgRecords()); - EXPECT_TRUE(B.hasDbgRecords()); - DbgVariableRecord *BDbgVal = - cast(&*B.getDbgRecordRange().begin()); - EXPECT_EQ(BDbgVal->getNumVariableLocationOps(), 1u); - EXPECT_EQ(ConstantInt::get(A.getType(), 0), - BDbgVal->getVariableLocationOp(0)); + auto *ADbgVal = cast(A.getNextNode()); + EXPECT_EQ(ADbgVal->getNumVariableLocationOps(), 1u); + EXPECT_EQ(ConstantInt::get(A.getType(), 0), ADbgVal->getVariableLocationOp(0)); // Introduce a use-before-def. Check that the dbg.values for %f become undef. EXPECT_TRUE(replaceAllDbgUsesWith(F_, G, G, DT)); - DbgVariableRecord *BarrierDbgVal = - cast(&*Barrier.getDbgRecordRange().begin()); - EXPECT_EQ(BarrierDbgVal->getNumVariableLocationOps(), 1u); - EXPECT_TRUE(BarrierDbgVal->isKillLocation()); + auto *FDbgVal = cast(F_.getNextNode()); + EXPECT_EQ(FDbgVal->getNumVariableLocationOps(), 1u); + EXPECT_TRUE(FDbgVal->isKillLocation()); - SmallVector BarrierDbgVals; - SmallVector BarrierDbgRecs; - findDbgValues(BarrierDbgVals, &F_, &BarrierDbgRecs); - EXPECT_EQ(0U, BarrierDbgVals.size()); - EXPECT_EQ(0U, BarrierDbgRecs.size()); + SmallVector FDbgVals; + findDbgValues(FDbgVals, &F_); + EXPECT_EQ(0U, FDbgVals.size()); // Simulate i32 -> i64 conversion to test sign-extension. Here are some // interesting cases to handle: @@ -961,15 +935,13 @@ TEST(Local, ReplaceAllDbgUsesWith) { // 4-6) like (1-3), but with a fragment EXPECT_TRUE(replaceAllDbgUsesWith(B, A, A, DT)); - SmallVector BDbgVals; - SmallVector BDbgRecs; - findDbgValues(BDbgVals, &A, &BDbgRecs); - EXPECT_EQ(0U, BDbgVals.size()); - EXPECT_EQ(6U, BDbgRecs.size()); + SmallVector ADbgVals; + findDbgValues(ADbgVals, &A); + EXPECT_EQ(6U, ADbgVals.size()); // Check that %a has a dbg.value with a DIExpression matching \p Ops. 
auto hasADbgVal = [&](ArrayRef Ops) { - return any_of(BDbgRecs, [&](DbgVariableRecord *DVI) { + return any_of(ADbgVals, [&](DbgValueInst *DVI) { assert(DVI->getVariable()->getName() == "2"); return DVI->getExpression()->getElements() == Ops; }); @@ -1372,11 +1344,6 @@ TEST(Local, ExpressionForConstant) { TEST(Local, ReplaceDbgVariableRecord) { LLVMContext C; - // FIXME: PreserveInputDbgFormat is set to true because this test has - // been written to expect debug intrinsics rather than debug records; use the - // intrinsic format until we update the test checks. - auto SettingGuard = SaveDbgInfoFormat(); - PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE; // Test that RAUW also replaces the operands of DbgVariableRecord objects, // i.e. non-instruction stored debugging information. From 424188abe4956d51c852668d206dfc9919290fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 11 Jun 2024 11:20:19 +0200 Subject: [PATCH 47/82] [clang][Interp][test] Add test for void* diagnostics changes --- clang/test/AST/Interp/cxx23.cpp | 16 ++++++++++++++-- clang/test/AST/Interp/cxx26.cpp | 10 ++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 clang/test/AST/Interp/cxx26.cpp diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp index 1efd784abbbe8f..d0991f3ffdff5e 100644 --- a/clang/test/AST/Interp/cxx23.cpp +++ b/clang/test/AST/Interp/cxx23.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: target={{.*}}-zos{{.*}} -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref20,all,all20 %s -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref23,all %s +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref,ref20,all,all20 %s +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref,ref23,all %s // RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter // RUN: %clang_cc1 
-std=c++23 -fsyntax-only -fcxx-exceptions -verify=expected23,all %s -fexperimental-new-constant-interpreter @@ -200,3 +200,15 @@ namespace UndefinedThreeWay { static_assert(!(*test_a_threeway)(A(), A())); // all-error {{static assertion expression is not an integral constant expression}} \ // all-note {{undefined function 'operator<=>' cannot be used in a constant expression}} } + +/// FIXME: The new interpreter is missing the "initializer of q is not a constant expression" diagnostics.a +/// That's because the cast from void* to int* is considered fine, but diagnosed. So we don't consider +/// q to be uninitialized. +namespace VoidCast { + constexpr void* p = nullptr; + constexpr int* q = static_cast(p); // all-error {{must be initialized by a constant expression}} \ + // all-note {{cast from 'void *' is not allowed in a constant expression}} \ + // ref-note {{declared here}} + static_assert(q == nullptr); // ref-error {{not an integral constant expression}} \ + // ref-note {{initializer of 'q' is not a constant expression}} +} diff --git a/clang/test/AST/Interp/cxx26.cpp b/clang/test/AST/Interp/cxx26.cpp new file mode 100644 index 00000000000000..0b0e2b21e8201e --- /dev/null +++ b/clang/test/AST/Interp/cxx26.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -std=c++26 -fsyntax-only -fcxx-exceptions -verify=ref,both %s +// RUN: %clang_cc1 -std=c++26 -fsyntax-only -fcxx-exceptions -verify=expected,both %s -fexperimental-new-constant-interpreter + +// both-no-diagnostics + +namespace VoidCast { + constexpr void* p = nullptr; + constexpr int* q = static_cast(p); + static_assert(q == nullptr); +} From 00262ab7e3b409ab59d6c6c2c6462215a1f27c5f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 11 Jun 2024 12:19:59 +0100 Subject: [PATCH 48/82] [X86] Pull out repeated SDLoc in various ADD/SUB/XOR folds. NFC. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 37 ++++++++++++------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2aec14e93d082d..2ed79385272fa8 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -50184,12 +50184,12 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. -static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { +static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG) { bool IsSub = N->getOpcode() == ISD::SUB; SDValue X = N->getOperand(0); SDValue Y = N->getOperand(1); EVT VT = N->getValueType(0); - SDLoc DL(N); if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG)) return ADCOrSBB; @@ -52718,7 +52718,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); } -static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, +static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) && "Invalid opcode for combing with CTLZ"); @@ -52758,7 +52758,6 @@ static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1)) return SDValue(); - SDLoc DL(N); EVT OpVT = VT; SDValue Op = OpCTLZ.getOperand(0); if (VT == MVT::i8) { @@ -52781,11 +52780,12 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc DL(N); // If this is SSE1 only convert to FXOR to avoid scalarization. 
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, - DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, + DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } @@ -52805,7 +52805,7 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; - if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget)) + if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget)) return R; if (DCI.isBeforeLegalizeOps()) @@ -52826,8 +52826,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, N0.getOperand(0).getValueType().isVector() && N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) { - return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0), - N0.getOperand(0).getValueType())); + return DAG.getBitcast( + VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType())); } // Handle AVX512 mask widening. 
@@ -52837,8 +52837,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() && TLI.isTypeLegal(N0.getOperand(1).getValueType())) { return DAG.getNode( - ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), - DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()), + ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0), + DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()), N0.getOperand(2)); } @@ -52851,7 +52851,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, auto *N1C = dyn_cast(N1); auto *N001C = dyn_cast(TruncExtSrc.getOperand(1)); if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) { - SDLoc DL(N); SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT); SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT); return DAG.getNode(ISD::XOR, DL, VT, LHS, @@ -55419,7 +55418,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, /// Try to fold those constants into an 'add' instruction to reduce instruction /// count. We do this with CMOV rather the generic 'select' because there are /// earlier folds that may be used to turn select-of-constants into logic hacks. -static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, +static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If an operand is zero, add-of-0 gets simplified away, so that's clearly // better because we eliminate 1-2 instructions. 
This transform is still @@ -55451,7 +55451,6 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = N->getValueType(0); - SDLoc DL(N); SDValue FalseOp = Cmov.getOperand(0); SDValue TrueOp = Cmov.getOperand(1); @@ -55492,7 +55491,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); SDLoc DL(N); - if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget)) + if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget)) return Select; if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget)) @@ -55550,7 +55549,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, Op0.getOperand(0), Op0.getOperand(2)); } - return combineAddOrSubToADCOrSBB(N, DAG); + return combineAddOrSubToADCOrSBB(N, DL, DAG); } // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov @@ -55626,6 +55625,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); + SDLoc DL(N); // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt. 
auto IsNonOpaqueConstant = [&](SDValue Op) { @@ -55645,7 +55645,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) && !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) { - SDLoc DL(N); EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT)); @@ -55676,14 +55675,14 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0, Op1.getOperand(1), Op1.getOperand(2)); - return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0), + return DAG.getNode(ISD::SUB, DL, Op0.getValueType(), ADC.getValue(0), Op1.getOperand(0)); } - if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget)) + if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget)) return V; - if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) + if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG)) return V; return combineSubSetcc(N, DAG); From 264bcbe1c8b3e885d896f0181d1ed54a7a4467bd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 11 Jun 2024 12:35:39 +0100 Subject: [PATCH 49/82] [InstCombine] Add #38139 test coverage --- llvm/test/Transforms/InstCombine/icmp-of-or-x.ll | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll index 304874645d5dc9..7ff111c42a9e06 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll @@ -399,3 +399,13 @@ define i1 @icmp_eq_x_invertable_y2(i8 %x, i8 %y) { %r = icmp eq i8 %yy, %or ret i1 %r } + +define i1 @PR38139(i8 %arg) { +; CHECK-LABEL: @PR38139( +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[ARG:%.*]], -64 +; CHECK-NEXT: ret i1 [[R]] +; + %masked = or i8 %arg, 192 + %r = icmp 
ne i8 %masked, %arg + ret i1 %r +} From efbd64cbd90f4cc4eb5b1166eb9840af3ec1bba8 Mon Sep 17 00:00:00 2001 From: donald chen Date: Tue, 11 Jun 2024 19:41:01 +0800 Subject: [PATCH 50/82] [mlir][arith] Delete unnecessary error logs (#94970) Function `getNeutralElement` already signals "cannot find neutral element" to its caller by returning `std::nullopt`, so no additional error log needs to be emitted. --- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 1 - mlir/test/Dialect/Linalg/transform-tile-reduction.mlir | 1 - 2 files changed, 2 deletions(-) diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index 5797c5681a5fdd..2f6647a2a27b15 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -2544,7 +2544,6 @@ std::optional mlir::arith::getNeutralElement(Operation *op) { .Case([](arith::MulIOp op) { return AtomicRMWKind::muli; }) .Default([](Operation *op) { return std::nullopt; }); if (!maybeKind) { - op->emitError() << "Unknown neutral element for: " << *op; return std::nullopt; } diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir index f3cf7c4dffa05f..8feb3c2a2c306a 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir @@ -355,7 +355,6 @@ module { %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor) outs(%arg1 : tensor) { ^bb0(%in: f32, %out: f32): %1 = llvm.fmul %in, %in : f32 - // expected-error @below {{Unknown neutral element for:}} %2 = llvm.fadd %1, %out : f32 linalg.yield %2 : f32 } -> tensor From 9b225d01f8edf08153aa704f534a5a6676384fc2 Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Tue, 11 Jun 2024 13:44:29 +0200 Subject: [PATCH 51/82] Fix complex abs with nnan/ninf. (#95080) The current logic tests for inf/inf and 0/0 inputs using a NaN check.
This doesn't work with all fastmath flags. With nnan and ninf, we can just check for a 0 maximum. With only nnan, we have to check for both cases separately. --- .../ComplexToStandard/ComplexToStandard.cpp | 45 ++++---- .../convert-to-standard.mlir | 106 +++++++++--------- 2 files changed, 79 insertions(+), 72 deletions(-) diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index d8150aeb828a59..6656be830989a4 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -40,31 +40,35 @@ Value computeAbs(Value real, Value imag, arith::FastMathFlags fmf, Value max = b.create(absReal, absImag, fmf); Value min = b.create(absReal, absImag, fmf); - Value ratio = b.create(min, max, fmf); - Value ratioSq = b.create(ratio, ratio, fmf); - Value ratioSqPlusOne = b.create(ratioSq, one, fmf); + + // The lowering below requires NaNs and infinities to work correctly. + arith::FastMathFlags fmfWithNaNInf = arith::bitEnumClear( + fmf, arith::FastMathFlags::nnan | arith::FastMathFlags::ninf); + Value ratio = b.create(min, max, fmfWithNaNInf); + Value ratioSq = b.create(ratio, ratio, fmfWithNaNInf); + Value ratioSqPlusOne = b.create(ratioSq, one, fmfWithNaNInf); Value result; if (fn == AbsFn::rsqrt) { - ratioSqPlusOne = b.create(ratioSqPlusOne, fmf); - min = b.create(min, fmf); - max = b.create(max, fmf); + ratioSqPlusOne = b.create(ratioSqPlusOne, fmfWithNaNInf); + min = b.create(min, fmfWithNaNInf); + max = b.create(max, fmfWithNaNInf); } if (fn == AbsFn::sqrt) { Value quarter = b.create( real.getType(), b.getFloatAttr(real.getType(), 0.25)); // sqrt(sqrt(a*b)) would avoid the pow, but will overflow more easily. 
- Value sqrt = b.create(max, fmf); - Value p025 = b.create(ratioSqPlusOne, quarter, fmf); - result = b.create(sqrt, p025, fmf); + Value sqrt = b.create(max, fmfWithNaNInf); + Value p025 = b.create(ratioSqPlusOne, quarter, fmfWithNaNInf); + result = b.create(sqrt, p025, fmfWithNaNInf); } else { - Value sqrt = b.create(ratioSqPlusOne, fmf); - result = b.create(max, sqrt, fmf); + Value sqrt = b.create(ratioSqPlusOne, fmfWithNaNInf); + result = b.create(max, sqrt, fmfWithNaNInf); } - Value isNaN = - b.create(arith::CmpFPredicate::UNO, result, result, fmf); + Value isNaN = b.create(arith::CmpFPredicate::UNO, result, + result, fmfWithNaNInf); return b.create(isNaN, min, result); } @@ -595,17 +599,20 @@ struct Log1pOpConversion : public OpConversionPattern { Value maxMinusOne = b.create(maxAbs, one, fmf); Value maxAbsOfRealPlusOneAndImagMinusOne = b.create(useReal, real, maxMinusOne); - Value minMaxRatio = b.create(minAbs, maxAbs, fmf); + arith::FastMathFlags fmfWithNaNInf = arith::bitEnumClear( + fmf, arith::FastMathFlags::nnan | arith::FastMathFlags::ninf); + Value minMaxRatio = b.create(minAbs, maxAbs, fmfWithNaNInf); Value logOfMaxAbsOfRealPlusOneAndImag = b.create(maxAbsOfRealPlusOneAndImagMinusOne, fmf); Value logOfSqrtPart = b.create( - b.create(minMaxRatio, minMaxRatio, fmf), fmf); + b.create(minMaxRatio, minMaxRatio, fmfWithNaNInf), + fmfWithNaNInf); Value r = b.create( - b.create(half, logOfSqrtPart, fmf), - logOfMaxAbsOfRealPlusOneAndImag, fmf); + b.create(half, logOfSqrtPart, fmfWithNaNInf), + logOfMaxAbsOfRealPlusOneAndImag, fmfWithNaNInf); Value resultReal = b.create( - b.create(arith::CmpFPredicate::UNO, r, r, fmf), minAbs, - r); + b.create(arith::CmpFPredicate::UNO, r, r, fmfWithNaNInf), + minAbs, r); Value resultImag = b.create(imag, realPlusOne, fmf); rewriter.replaceOpWithNewOp(op, type, resultReal, resultImag); diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir 
index 6dafe29e2e5f69..d7767bda08435f 100644 --- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir +++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s --convert-complex-to-standard --split-input-file |\ -// RUN: FileCheck %s --dump-input=always +// RUN: FileCheck %s // CHECK-LABEL: func @complex_abs // CHECK-SAME: %[[ARG:.*]]: complex @@ -703,14 +703,14 @@ func.func @complex_sqrt_nnan_ninf(%arg: complex) -> complex { // CHECK: %[[ABSIM:.*]] = math.absf %[[IM]] fastmath : f32 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABSRE]], %[[ABSIM]] fastmath : f32 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABSRE]], %[[ABSIM]] fastmath : f32 -// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 -// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] : f32 +// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] : f32 // CHECK: %[[QUARTER:.*]] = arith.constant 2.500000e-01 : f32 -// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath : f32 -// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath : f32 -// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath : f32 -// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath : f32 +// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] : f32 +// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] : f32 +// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] : f32 +// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] : f32 // CHECK: %[[SQRT_ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[SQRT_ABS_OR_NAN]] : f32 // CHECK: %[[ARGARG:.*]] = math.atan2 
%[[IM]], %[[RE]] fastmath : f32 // CHECK: %[[SQRTARG:.*]] = arith.mulf %[[ARGARG]], %[[HALF]] fastmath : f32 @@ -819,12 +819,12 @@ func.func @complex_abs_with_fmf(%arg: complex) -> f32 { // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG]] fastmath : f32 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 -// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 -// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 -// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 -// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 -// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 +// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 +// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 +// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32 // CHECK: return %[[ABS]] : f32 @@ -918,12 +918,12 @@ func.func @complex_log_with_fmf(%arg: complex) -> complex { // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG]] fastmath : f32 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 -// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 -// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] 
= arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 -// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 -// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 -// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 +// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 +// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 +// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32 // CHECK: %[[RESULT_REAL:.*]] = math.log %[[ABS]] fastmath : f32 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex @@ -952,14 +952,14 @@ func.func @complex_log1p_with_fmf(%arg: complex) -> complex { // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL_PLUS_ONE]], %[[ABS_IMAG]] fastmath : f32 // CHECK: %[[CMPF:.*]] = arith.cmpf ogt, %[[REAL_PLUS_ONE]], %[[ABS_IMAG]] fastmath : f32 // CHECK: %[[MAX_MINUS_ONE:.*]] = arith.subf %[[MAX]], %[[ONE]] fastmath : f32 -// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %0, %[[MAX_MINUS_ONE]] : f32 -// CHECK: %[[MIN_MAX_RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 +// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[REAL]], %[[MAX_MINUS_ONE]] : f32 +// CHECK: %[[MIN_MAX_RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 // CHECK: %[[LOG_1:.*]] = math.log1p %[[SELECT]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[MIN_MAX_RATIO]], %[[MIN_MAX_RATIO]] fastmath : f32 -// CHECK: %[[LOG_SQ:.*]] = math.log1p %[[RATIO_SQ]] fastmath : f32 -// CHECK: %[[HALF_LOG_SQ:.*]] = arith.mulf %cst, %[[LOG_SQ]] fastmath : f32 -// CHECK: %[[R:.*]] = arith.addf 
%[[HALF_LOG_SQ]], %[[LOG_1]] fastmath : f32 -// CHECK: %[[ISNAN:.*]] = arith.cmpf uno, %[[R]], %[[R]] fastmath : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[MIN_MAX_RATIO]], %[[MIN_MAX_RATIO]] fastmath : f32 +// CHECK: %[[LOG_SQ:.*]] = math.log1p %[[RATIO_SQ]] fastmath : f32 +// CHECK: %[[HALF_LOG_SQ:.*]] = arith.mulf %cst, %[[LOG_SQ]] fastmath : f32 +// CHECK: %[[R:.*]] = arith.addf %[[HALF_LOG_SQ]], %[[LOG_1]] fastmath : f32 +// CHECK: %[[ISNAN:.*]] = arith.cmpf uno, %[[R]], %[[R]] fastmath : f32 // CHECK: %[[RESULT_REAL:.*]] = arith.select %[[ISNAN]], %[[MIN]], %[[R]] : f32 // CHECK: %[[RESULT_IMAG:.*]] = math.atan2 %[[IMAG]], %[[REAL_PLUS_ONE]] fastmath : f32 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex @@ -1298,14 +1298,14 @@ func.func @complex_atan2_with_fmf(%lhs: complex, // CHECK: %[[ABSIM:.*]] = math.absf %[[IM]] fastmath : f32 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABSRE]], %[[ABSIM]] fastmath : f32 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABSRE]], %[[ABSIM]] fastmath : f32 -// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 -// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 +// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 // CHECK: %[[QUARTER:.*]] = arith.constant 2.500000e-01 : f32 -// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath : f32 -// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath : f32 -// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath : f32 -// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath : f32 +// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath : f32 +// 
CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath : f32 +// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath : f32 +// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath : f32 // CHECK: %[[SQRT_ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[SQRT_ABS_OR_NAN]] : f32 // CHECK: %[[ARGARG:.*]] = math.atan2 %[[IM]], %[[RE]] fastmath : f32 // CHECK: %[[SQRTARG:.*]] = arith.mulf %[[ARGARG]], %[[HALF]] fastmath : f32 @@ -1539,12 +1539,12 @@ func.func @complex_atan2_with_fmf(%lhs: complex, // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG]] fastmath : f32 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 -// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 -// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 -// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 -// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 -// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 +// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 +// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 +// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32 // CHECK: %[[VAR436:.*]] = math.log %[[ABS]] fastmath : f32 // CHECK: %[[VAR437:.*]] = complex.re %[[VAR415]] : complex @@ -1778,14 
+1778,14 @@ func.func @complex_sqrt_with_fmf(%arg: complex) -> complex { // CHECK: %[[ABSIM:.*]] = math.absf %[[IM]] fastmath : f32 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABSRE]], %[[ABSIM]] fastmath : f32 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABSRE]], %[[ABSIM]] fastmath : f32 -// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 -// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 +// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 // CHECK: %[[QUARTER:.*]] = arith.constant 2.500000e-01 : f32 -// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath : f32 -// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath : f32 -// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath : f32 -// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath : f32 +// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath : f32 +// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath : f32 +// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath : f32 +// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath : f32 // CHECK: %[[SQRT_ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[SQRT_ABS_OR_NAN]] : f32 // CHECK: %[[ARGARG:.*]] = math.atan2 %[[IM]], %[[RE]] fastmath : f32 // CHECK: %[[SQRTARG:.*]] = arith.mulf %[[ARGARG]], %[[HALF]] fastmath : f32 @@ -1886,12 +1886,12 @@ func.func @complex_sign_with_fmf(%arg: complex) -> complex { // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG2]] fastmath : f32 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 // CHECK: 
%[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath : f32 -// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 -// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 -// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 -// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 -// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 -// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 +// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath : f32 +// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath : f32 +// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath : f32 +// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath : f32 +// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath : f32 +// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath : f32 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32 // CHECK: %[[REAL_SIGN:.*]] = arith.divf %[[REAL]], %[[ABS]] fastmath : f32 // CHECK: %[[IMAG_SIGN:.*]] = arith.divf %[[IMAG]], %[[ABS]] fastmath : f32 From fa9745e8d39498a7090b108dd2717ca0466189e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 11 Jun 2024 11:20:41 +0200 Subject: [PATCH 52/82] [clang][Interp][NFC] Remove unneeded opcode initializers --- clang/lib/AST/Interp/Opcodes.td | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index 45fc11e5645767..df362efd8b58b2 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -139,7 +139,6 @@ class AluOpcode : Opcode { } class FloatOpcode : Opcode { - let Types = []; let Args = [ArgRoundingMode]; } @@ -195,17 +194,14 @@ def NoRet : Opcode {} def Call : 
Opcode { let Args = [ArgFunction, ArgUint32]; - let Types = []; } def CallVirt : Opcode { let Args = [ArgFunction, ArgUint32]; - let Types = []; } def CallBI : Opcode { let Args = [ArgFunction, ArgCallExpr]; - let Types = []; } def CallPtr : Opcode { @@ -214,7 +210,6 @@ def CallPtr : Opcode { def CallVar : Opcode { let Args = [ArgFunction, ArgUint32]; - let Types = []; } def OffsetOf : Opcode { @@ -399,8 +394,6 @@ def InitGlobalTemp : AccessOpcode { // [Pointer] -> [Pointer] def InitGlobalTempComp : Opcode { let Args = [ArgLETD]; - let Types = []; - let HasGroup = 0; } // [Value] -> [] def SetGlobal : AccessOpcode; @@ -505,13 +498,9 @@ def SubPtr : Opcode { } // [Pointer] -> [Pointer] -def IncPtr : Opcode { - let HasGroup = 0; -} +def IncPtr : Opcode; // [Pointer] -> [Pointer] -def DecPtr : Opcode { - let HasGroup = 0; -} +def DecPtr : Opcode; //===----------------------------------------------------------------------===// // Function pointers. @@ -607,7 +596,6 @@ def Cast: Opcode { } def CastFP : Opcode { - let Types = []; let Args = [ArgFltSemantics, ArgRoundingMode]; } @@ -642,12 +630,10 @@ def CastFloatingIntegral : Opcode { } def CastFloatingIntegralAP : Opcode { - let Types = []; let Args = [ArgUint32]; } def CastFloatingIntegralAPS : Opcode { - let Types = []; let Args = [ArgUint32]; } @@ -656,13 +642,9 @@ def CastPointerIntegral : Opcode { let HasGroup = 1; } def CastPointerIntegralAP : Opcode { - let Types = []; - let HasGroup = 0; let Args = [ArgUint32]; } def CastPointerIntegralAPS : Opcode { - let Types = []; - let HasGroup = 0; let Args = [ArgUint32]; } def PtrPtrCast : Opcode { From 3f883111243c4abfc06670190771b9cafc092bd8 Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Tue, 11 Jun 2024 04:51:21 -0700 Subject: [PATCH 53/82] [Driver] Rearrange some Apple version testing (#94514) There were four tests in Driver that actually tested bits of Driver and bits of CodeGen, and therefore had target restrictions. 
Rework those four tests into one Driver test (with no target restrictions) and two target-specific CodeGen tests. --- clang/test/Driver/apple-os-triples.c | 31 ++++++++++++++++++++++ clang/test/Driver/appletvos-version-min.c | 8 ------ clang/test/Driver/driverkit-version-min.c | 5 ---- clang/test/Driver/ios-version-min.c | 7 ----- clang/test/Driver/watchos-version-min.c | 7 ----- llvm/test/CodeGen/ARM/apple-version-min.ll | 9 +++++++ llvm/test/CodeGen/X86/apple-version-min.ll | 12 +++++++++ 7 files changed, 52 insertions(+), 27 deletions(-) create mode 100644 clang/test/Driver/apple-os-triples.c delete mode 100644 clang/test/Driver/appletvos-version-min.c delete mode 100644 clang/test/Driver/driverkit-version-min.c delete mode 100644 clang/test/Driver/ios-version-min.c delete mode 100644 clang/test/Driver/watchos-version-min.c create mode 100644 llvm/test/CodeGen/ARM/apple-version-min.ll create mode 100644 llvm/test/CodeGen/X86/apple-version-min.ll diff --git a/clang/test/Driver/apple-os-triples.c b/clang/test/Driver/apple-os-triples.c new file mode 100644 index 00000000000000..7664d3bc19fca2 --- /dev/null +++ b/clang/test/Driver/apple-os-triples.c @@ -0,0 +1,31 @@ +// Test triple manipulations. 
+ +// RUN: %clang -### -c %s \ +// RUN: --target=i386-apple-darwin10 -mappletvsimulator-version-min=9.0 -arch x86_64 2>&1 | \ +// RUN: FileCheck %s -DARCH=x86_64 -DOS=tvos9.0.0-simulator +// RUN: %clang -### -c %s \ +// RUN: --target=armv7s-apple-darwin10 -mappletvos-version-min=9.0 -arch arm64 2>&1 | \ +// RUN: FileCheck %s -DARCH=arm64 -DOS=tvos9.0.0 +// RUN: env TVOS_DEPLOYMENT_TARGET=9.0 %clang -### -c %s \ +// RUN: -isysroot SDKs/MacOSX10.9.sdk -target i386-apple-darwin10 -arch x86_64 2>&1 | \ +// RUN: FileCheck %s -DARCH=x86_64 -DOS=tvos9.0.0 + +// RUN: %clang -### -c %s \ +// RUN: --target=x86_64-apple-driverkit19.0 2>&1 | \ +// RUN: FileCheck %s -DARCH=x86_64 -DOS=driverkit19.0.0 + +// RUN: %clang -### -c %s \ +// RUN: --target=i386-apple-darwin10 -miphonesimulator-version-min=7.0 -arch i386 2>&1 | \ +// RUN: FileCheck %s -DARCH=i386 -DOS=ios7.0.0-simulator +// RUN: %clang -### -c %s \ +// RUN: --target=armv7s-apple-darwin10 -miphoneos-version-min=7.0 -arch armv7s 2>&1 | \ +// RUN: FileCheck %s -DARCH=thumbv7s -DOS=ios7.0.0 + +// RUN: %clang -### -c %s \ +// RUN: --target=i386-apple-darwin10 -mwatchsimulator-version-min=2.0 -arch i386 2>&1 | \ +// RUN: FileCheck %s -DARCH=i386 -DOS=watchos2.0.0-simulator +// RUN: %clang -### -c %s \ +// RUN: --target=armv7s-apple-darwin10 -mwatchos-version-min=2.0 -arch armv7k 2>&1 | \ +// RUN: FileCheck %s -DARCH=thumbv7k -DOS=watchos2.0.0 + +// CHECK: "-cc1" "-triple" "[[ARCH]]-apple-[[OS]]" diff --git a/clang/test/Driver/appletvos-version-min.c b/clang/test/Driver/appletvos-version-min.c deleted file mode 100644 index 7cbb2001a3ec21..00000000000000 --- a/clang/test/Driver/appletvos-version-min.c +++ /dev/null @@ -1,8 +0,0 @@ -// REQUIRES: x86-registered-target -// REQUIRES: aarch64-registered-target -// RUN: %clang -target i386-apple-darwin10 -mappletvsimulator-version-min=9.0 -arch x86_64 -S -o - %s | FileCheck %s -// RUN: %clang -target armv7s-apple-darwin10 -mappletvos-version-min=9.0 -arch arm64 -S -o - %s | 
FileCheck %s -// RUN: env TVOS_DEPLOYMENT_TARGET=9.0 %clang -isysroot SDKs/MacOSX10.9.sdk -target i386-apple-darwin10 -arch x86_64 -S -o - %s | FileCheck %s - -int main() { return 0; } -// CHECK: .tvos_version_min 9, 0 diff --git a/clang/test/Driver/driverkit-version-min.c b/clang/test/Driver/driverkit-version-min.c deleted file mode 100644 index 9966152f11ce82..00000000000000 --- a/clang/test/Driver/driverkit-version-min.c +++ /dev/null @@ -1,5 +0,0 @@ -// REQUIRES: x86-registered-target -// RUN: %clang -target x86_64-apple-driverkit19.0 -S -o - %s | FileCheck %s - -int main() { return 0; } -// CHECK: .build_version driverkit, 19, 0 diff --git a/clang/test/Driver/ios-version-min.c b/clang/test/Driver/ios-version-min.c deleted file mode 100644 index aa536cf7827b36..00000000000000 --- a/clang/test/Driver/ios-version-min.c +++ /dev/null @@ -1,7 +0,0 @@ -// REQUIRES: x86-registered-target -// REQUIRES: arm-registered-target -// RUN: %clang -target i386-apple-darwin10 -miphonesimulator-version-min=7.0 -arch i386 -S -o - %s | FileCheck %s -// RUN: %clang -target armv7s-apple-darwin10 -miphoneos-version-min=7.0 -arch armv7s -S -o - %s | FileCheck %s - -int main() { return 0; } -// CHECK: .ios_version_min 7, 0 diff --git a/clang/test/Driver/watchos-version-min.c b/clang/test/Driver/watchos-version-min.c deleted file mode 100644 index 8f12285d4e4737..00000000000000 --- a/clang/test/Driver/watchos-version-min.c +++ /dev/null @@ -1,7 +0,0 @@ -// REQUIRES: x86-registered-target -// REQUIRES: arm-registered-target -// RUN: %clang -target i386-apple-darwin10 -mwatchsimulator-version-min=2.0 -arch i386 -S -o - %s | FileCheck %s -// RUN: %clang -target armv7s-apple-darwin10 -mwatchos-version-min=2.0 -arch armv7k -S -o - %s | FileCheck %s - -int main() { return 0; } -// CHECK: .watchos_version_min 2, 0 diff --git a/llvm/test/CodeGen/ARM/apple-version-min.ll b/llvm/test/CodeGen/ARM/apple-version-min.ll new file mode 100644 index 00000000000000..6b4af21d74c00d --- /dev/null +++ 
b/llvm/test/CodeGen/ARM/apple-version-min.ll @@ -0,0 +1,9 @@ +; Test emitting version_min directives. + +; RUN: llc %s -filetype=asm -o - --mtriple arm64-apple-tvos9.0.0 | FileCheck %s --check-prefix=TVOS +; RUN: llc %s -filetype=asm -o - --mtriple thumbv7s-apple-ios7.0.0 | FileCheck %s --check-prefix=IOS +; RUN: llc %s -filetype=asm -o - --mtriple thumbv7k-apple-watchos2.0.0 | FileCheck %s --check-prefix=WATCHOS + +; TVOS: .tvos_version_min 9, 0 +; IOS: .ios_version_min 7, 0 +; WATCHOS: .watchos_version_min 2, 0 diff --git a/llvm/test/CodeGen/X86/apple-version-min.ll b/llvm/test/CodeGen/X86/apple-version-min.ll new file mode 100644 index 00000000000000..fde10ac3b42483 --- /dev/null +++ b/llvm/test/CodeGen/X86/apple-version-min.ll @@ -0,0 +1,12 @@ +; Test emitting version_min directives. + +; RUN: llc %s -filetype=asm -o - --mtriple x86_64-apple-tvos9.0.0-simulator | FileCheck %s --check-prefix=TVOS +; RUN: llc %s -filetype=asm -o - --mtriple x86_64-apple-tvos9.0.0 | FileCheck %s --check-prefix=TVOS +; RUN: llc %s -filetype=asm -o - --mtriple x86_64-apple-driverkit19.0.0 | FileCheck %s --check-prefix=DRIVERKIT +; RUN: llc %s -filetype=asm -o - --mtriple i386-apple-ios7.0.0-simulator | FileCheck %s --check-prefix=IOS +; RUN: llc %s -filetype=asm -o - --mtriple i386-apple-watchos2.0.0-simulator | FileCheck %s --check-prefix=WATCHOS + +; TVOS: .tvos_version_min 9, 0 +; DRIVERKIT: .build_version driverkit, 19, 0 +; IOS: .ios_version_min 7, 0 +; WATCHOS: .watchos_version_min 2, 0 From a141a28c0cf415d8ca410a636c3aacf3d683ab38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Tue, 11 Jun 2024 13:57:31 +0200 Subject: [PATCH 54/82] [SPIR-V] Fix flakiness during switch generation. (#95001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The case list of the switches generated by this pass was not deterministic (it depended on allocation patterns). This is because the order of `CasesList` relied on the iteration order of an unordered container.
Using the sorted exit target list for those should solve the problem. Fixes #94961 Signed-off-by: Nathan Gauër --- .../SPIRV/SPIRVMergeRegionExitTargets.cpp | 25 +++++++++++-------- .../SPIRV/structurizer/merge-exit-break.ll | 2 +- .../merge-exit-convergence-in-break.ll | 2 +- .../structurizer/merge-exit-multiple-break.ll | 2 +- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 2744c25d1bc754..52354281cdd7e3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -17,6 +17,8 @@ #include "SPIRVSubtarget.h" #include "SPIRVTargetMachine.h" #include "SPIRVUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/IR/CFG.h" @@ -71,7 +73,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass { /// terminator will take. llvm::Value *createExitVariable( BasicBlock *BB, - const std::unordered_map &TargetToValue) { + const DenseMap &TargetToValue) { auto *T = BB->getTerminator(); if (isa(T)) return nullptr; @@ -103,7 +105,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass { /// Replaces |BB|'s branch targets present in |ToReplace| with |NewTarget|. void replaceBranchTargets(BasicBlock *BB, - const std::unordered_set ToReplace, + const SmallPtrSet &ToReplace, BasicBlock *NewTarget) { auto *T = BB->getTerminator(); if (isa(T)) @@ -133,7 +135,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass { bool runOnConvergenceRegionNoRecurse(LoopInfo &LI, const SPIRV::ConvergenceRegion *CR) { // Gather all the exit targets for this region. 
- std::unordered_set ExitTargets; + SmallPtrSet ExitTargets; for (BasicBlock *Exit : CR->Exits) { for (BasicBlock *Target : gatherSuccessors(Exit)) { if (CR->Blocks.count(Target) == 0) @@ -164,9 +166,10 @@ class SPIRVMergeRegionExitTargets : public FunctionPass { // Creating one constant per distinct exit target. This will be route to the // correct target. - std::unordered_map TargetToValue; + DenseMap TargetToValue; for (BasicBlock *Target : SortedExitTargets) - TargetToValue.emplace(Target, Builder.getInt32(TargetToValue.size())); + TargetToValue.insert( + std::make_pair(Target, Builder.getInt32(TargetToValue.size()))); // Creating one variable per exit node, set to the constant matching the // targeted external block. @@ -184,12 +187,12 @@ class SPIRVMergeRegionExitTargets : public FunctionPass { } // Creating the switch to jump to the correct exit target. - std::vector> CasesList( - TargetToValue.begin(), TargetToValue.end()); - llvm::SwitchInst *Sw = - Builder.CreateSwitch(node, CasesList[0].first, CasesList.size() - 1); - for (size_t i = 1; i < CasesList.size(); i++) - Sw->addCase(CasesList[i].second, CasesList[i].first); + llvm::SwitchInst *Sw = Builder.CreateSwitch(node, SortedExitTargets[0], + SortedExitTargets.size() - 1); + for (size_t i = 1; i < SortedExitTargets.size(); i++) { + BasicBlock *BB = SortedExitTargets[i]; + Sw->addCase(TargetToValue[BB], BB); + } // Fix exit branches to redirect to the new exit. 
for (auto Exit : CR->Exits) diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll index b3fcdc978625f9..e7b1b441405f61 100644 --- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll +++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll @@ -66,7 +66,7 @@ while.end: ; CHECK: %[[#new_end]] = OpLabel ; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_1]] %[[#while_cond]] %[[#int_0]] %[[#while_body]] -; CHECK: OpSwitch %[[#route]] %[[#while_end_loopexit]] 0 %[[#if_then]] +; CHECK: OpSwitch %[[#route]] %[[#if_then]] 1 %[[#while_end_loopexit]] } declare token @llvm.experimental.convergence.entry() #2 diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll index a67c58fdd5749f..593e3631c02b9d 100644 --- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll +++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll @@ -75,7 +75,7 @@ while.end: ; CHECK: %[[#new_end]] = OpLabel ; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_0]] %[[#while_cond]] %[[#int_1]] %[[#tail]] -; CHECK: OpSwitch %[[#route]] %[[#while_end]] 0 %[[#while_end_loopexit]] +; CHECK: OpSwitch %[[#route]] %[[#while_end_loopexit]] 1 %[[#while_end]] } declare token @llvm.experimental.convergence.entry() #2 diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll index 32a97553df05e3..9806dd7955468e 100644 --- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll +++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll @@ -85,7 +85,7 @@ while.end: ; CHECK: %[[#new_end]] = OpLabel ; CHECK: %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_2]] %[[#while_cond]] %[[#int_0]] %[[#while_body]] %[[#int_1]] %[[#if_end]] -; CHECK: OpSwitch %[[#route]] %[[#while_end_loopexit]] 1 
%[[#if_then2]] 0 %[[#if_then]] +; CHECK: OpSwitch %[[#route]] %[[#if_then]] 1 %[[#if_then2]] 2 %[[#while_end_loopexit]] } declare token @llvm.experimental.convergence.entry() #2 From ca920bb6285e9995f5a202d040af79363e98ab28 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 11 Jun 2024 13:11:41 +0100 Subject: [PATCH 55/82] [MLIR][Flang][DebugInfo] Set debug info format in MLIR->IR translation (#95098) MLIR's LLVM dialect does not internally support debug records, only converting to/from debug intrinsics. To smooth the transition from intrinsics to records, there is a step prior to IR->MLIR translation that switches the IR module to intrinsic-form; this patch adds the equivalent conversion to record-form at MLIR->IR translation, and also modifies the flang front end to use the WriteNewDbgInfoFormat flag when it is emitting LLVM IR. --- flang/lib/Frontend/FrontendActions.cpp | 9 +++++++++ mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index b1b6391f1439c6..a4db944e8c0abd 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -50,6 +50,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/IR/DebugProgramInstruction.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" @@ -81,6 +82,8 @@ using namespace Fortran::frontend; llvm::PassPluginLibraryInfo get##Ext##PluginInfo(); #include "llvm/Support/Extension.def" +extern llvm::cl::opt WriteNewDbgInfoFormat; + /// Save the given \c mlirModule to a temporary .mlir file, in a location /// decided by the -save-temps flag. No files are produced if the flag is not /// specified. 
@@ -1271,6 +1274,12 @@ void CodeGenAction::executeAction() { runOptimizationPipeline(ci.isOutputStreamNull() ? *os : ci.getOutputStream()); if (action == BackendActionTy::Backend_EmitLL) { + // When printing LLVM IR, we should convert the module to the debug info + // format that LLVM expects us to print. + llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, + WriteNewDbgInfoFormat); + if (WriteNewDbgInfoFormat) + llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(ci.isOutputStreamNull() ? *os : ci.getOutputStream(), /*AssemblyAnnotationWriter=*/nullptr); return; diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 7b86b250c294b4..e1a60f195fe89c 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -64,6 +64,8 @@ using namespace mlir; using namespace mlir::LLVM; using namespace mlir::LLVM::detail; +extern llvm::cl::opt UseNewDbgInfoFormat; + #include "mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc" namespace { @@ -1789,6 +1791,9 @@ prepareLLVMModule(Operation *m, llvm::LLVMContext &llvmContext, StringRef name) { m->getContext()->getOrLoadDialect(); auto llvmModule = std::make_unique(name, llvmContext); + // ModuleTranslation can currently only construct modules in the old debug + // info format, so set the flag accordingly. + llvmModule->setNewDbgInfoFormatFlag(false); if (auto dataLayoutAttr = m->getDiscardableAttr(LLVM::LLVMDialect::getDataLayoutAttrName())) { llvmModule->setDataLayout(cast(dataLayoutAttr).getValue()); @@ -1867,6 +1872,11 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, if (failed(translator.convertFunctions())) return nullptr; + // Once we've finished constructing elements in the module, we should convert + // it to use the debug info format desired by LLVM. 
+ // See https://llvm.org/docs/RemoveDIsDebugInfo.html + translator.llvmModule->setIsNewDbgInfoFormat(UseNewDbgInfoFormat); + if (!disableVerification && llvm::verifyModule(*translator.llvmModule, &llvm::errs())) return nullptr; From 546c816a529835a4cf89deecff957ea336a94fa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Tue, 11 Jun 2024 14:16:42 +0200 Subject: [PATCH 56/82] [clang-tidy] Improve sizeof(pointer) handling in bugprone-sizeof-expression (#94356) This commit reimplements the functionality of the Clang Static Analyzer checker `alpha.core.SizeofPointer` within clang-tidy by adding a new (off-by-default) option to bugprone-sizeof-expression which activates reporting all the `sizeof(ptr)` expressions (where ptr is an expression that produces a pointer). The main motivation for this change is that `alpha.core.SizeofPointer` was an AST-based checker, which did not rely on the path sensitive capabilities of the Static Analyzer, so there was no reason to keep it in the Static Analyzer instead of the more lightweight clang-tidy. After this commit I'm planning to create a separate commit that deletes `alpha.core.SizeofPointer` from Clang Static Analyzer. It was natural to place this moved logic in bugprone-sizeof-expression, because that check already provided several heuristics that reported various especially suspicious classes of `sizeof(ptr)` expressions. The new mode `WarnOnSizeOfPointer` is off-by-default, so it won't surprise the existing users; but it can provide more thorough coverage for the vulnerability CWE-467 ("Use of sizeof() on a Pointer Type") than the existing partial heuristics. Previously this checker had an exception that the RHS of a `sizeof(array) / sizeof(array[0])` expression is not reported; I generalized this to an exception that the check doesn't report `sizeof(expr[0])` and `sizeof(*expr)`. 
This idea is taken from the Static Analyzer checker `alpha.core.SizeofPointer` (which had an exception for `*expr`), but analysis of open source projects confirmed that this indeed eliminates lots of unwanted results. Note that the suppression of `sizeof(expr[0])` and `sizeof(*expr)` reports also affects the "old" mode `WarnOnSizeOfPointerToAggregate` which is enabled by default. This commit also replaces the old message "suspicious usage of 'sizeof(A*)'; pointer to aggregate" with two more concrete messages; but I feel that this tidy check would deserve a thorough cleanup of all the diagnostic messages that it can produce. (I added a FIXME to mark one outright misleading message.) --- .../bugprone/SizeofExpressionCheck.cpp | 115 +++++---- .../bugprone/SizeofExpressionCheck.h | 1 + clang-tools-extra/docs/ReleaseNotes.rst | 6 + .../checks/bugprone/sizeof-expression.rst | 15 +- .../checkers/bugprone/sizeof-expression-2.c | 12 +- .../sizeof-expression-any-pointer.cpp | 241 ++++++++++++++++++ .../checkers/bugprone/sizeof-expression.cpp | 73 ++++-- 7 files changed, 388 insertions(+), 75 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-any-pointer.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp index 5e64d23874ec17..c25ee42d0899ae 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp @@ -67,7 +67,8 @@ SizeofExpressionCheck::SizeofExpressionCheck(StringRef Name, WarnOnSizeOfCompareToConstant( Options.get("WarnOnSizeOfCompareToConstant", true)), WarnOnSizeOfPointerToAggregate( - Options.get("WarnOnSizeOfPointerToAggregate", true)) {} + Options.get("WarnOnSizeOfPointerToAggregate", true)), + WarnOnSizeOfPointer(Options.get("WarnOnSizeOfPointer", false)) {} void SizeofExpressionCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { 
Options.store(Opts, "WarnOnSizeOfConstant", WarnOnSizeOfConstant); @@ -78,6 +79,7 @@ void SizeofExpressionCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { WarnOnSizeOfCompareToConstant); Options.store(Opts, "WarnOnSizeOfPointerToAggregate", WarnOnSizeOfPointerToAggregate); + Options.store(Opts, "WarnOnSizeOfPointer", WarnOnSizeOfPointer); } void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) { @@ -127,17 +129,30 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) { const auto ConstStrLiteralDecl = varDecl(isDefinition(), hasType(hasCanonicalType(CharPtrType)), hasInitializer(ignoringParenImpCasts(stringLiteral()))); + const auto VarWithConstStrLiteralDecl = expr( + hasType(hasCanonicalType(CharPtrType)), + ignoringParenImpCasts(declRefExpr(hasDeclaration(ConstStrLiteralDecl)))); Finder->addMatcher( - sizeOfExpr(has(ignoringParenImpCasts( - expr(hasType(hasCanonicalType(CharPtrType)), - ignoringParenImpCasts(declRefExpr( - hasDeclaration(ConstStrLiteralDecl))))))) + sizeOfExpr(has(ignoringParenImpCasts(VarWithConstStrLiteralDecl))) .bind("sizeof-charp"), this); - // Detect sizeof(ptr) where ptr points to an aggregate (i.e. sizeof(&S)). - // Do not find it if RHS of a 'sizeof(arr) / sizeof(arr[0])' expression. - if (WarnOnSizeOfPointerToAggregate) { + // Detect sizeof(ptr) where ptr is a pointer (CWE-467). + // + // In WarnOnSizeOfPointerToAggregate mode only report cases when ptr points + // to an aggregate type or ptr is an expression that (implicitly or + // explicitly) casts an array to a pointer type. (These are more suspicious + // than other sizeof(ptr) expressions because they can appear as distorted + // forms of the common sizeof(aggregate) expressions.) + // + // To avoid false positives, the check doesn't report expressions like + // 'sizeof(pp[0])' and 'sizeof(*pp)' where `pp` is a pointer-to-pointer or + // array of pointers. 
(This filters out both `sizeof(arr) / sizeof(arr[0])` + // expressions and other cases like `p = realloc(p, newsize * sizeof(*p));`.) + // + // Moreover this generic message is suppressed in cases that are also matched + // by the more concrete matchers 'sizeof-this' and 'sizeof-charp'. + if (WarnOnSizeOfPointerToAggregate || WarnOnSizeOfPointer) { const auto ArrayExpr = ignoringParenImpCasts(hasType(hasCanonicalType(arrayType()))); const auto ArrayCastExpr = expr(anyOf( @@ -149,32 +164,31 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) { const auto PointerToStructType = hasUnqualifiedDesugaredType(pointerType(pointee(recordType()))); - const auto PointerToStructExpr = expr( - hasType(hasCanonicalType(PointerToStructType)), unless(cxxThisExpr())); - - const auto ArrayOfPointersExpr = ignoringParenImpCasts( - hasType(hasCanonicalType(arrayType(hasElementType(pointerType())) - .bind("type-of-array-of-pointers")))); - const auto ArrayOfSamePointersExpr = - ignoringParenImpCasts(hasType(hasCanonicalType( - arrayType(equalsBoundNode("type-of-array-of-pointers"))))); + const auto PointerToStructTypeWithBinding = + type(PointerToStructType).bind("struct-type"); + const auto PointerToStructExpr = + expr(hasType(hasCanonicalType(PointerToStructType))); + + const auto PointerToDetectedExpr = + WarnOnSizeOfPointer + ? 
expr(hasType(hasUnqualifiedDesugaredType(pointerType()))) + : expr(anyOf(ArrayCastExpr, PointerToArrayExpr, + PointerToStructExpr)); + const auto ZeroLiteral = ignoringParenImpCasts(integerLiteral(equals(0))); - const auto ArrayOfSamePointersZeroSubscriptExpr = - ignoringParenImpCasts(arraySubscriptExpr( - hasBase(ArrayOfSamePointersExpr), hasIndex(ZeroLiteral))); - const auto ArrayLengthExprDenom = - expr(hasParent(binaryOperator(hasOperatorName("/"), - hasLHS(ignoringParenImpCasts(sizeOfExpr( - has(ArrayOfPointersExpr)))))), - sizeOfExpr(has(ArrayOfSamePointersZeroSubscriptExpr))); + const auto SubscriptExprWithZeroIndex = + arraySubscriptExpr(hasIndex(ZeroLiteral)); + const auto DerefExpr = + ignoringParenImpCasts(unaryOperator(hasOperatorName("*"))); Finder->addMatcher( - expr(sizeOfExpr(anyOf( - has(ignoringParenImpCasts(anyOf( - ArrayCastExpr, PointerToArrayExpr, PointerToStructExpr))), - has(PointerToStructType))), - unless(ArrayLengthExprDenom)) - .bind("sizeof-pointer-to-aggregate"), + expr(sizeOfExpr(anyOf(has(ignoringParenImpCasts( + expr(PointerToDetectedExpr, unless(DerefExpr), + unless(SubscriptExprWithZeroIndex), + unless(VarWithConstStrLiteralDecl), + unless(cxxThisExpr())))), + has(PointerToStructTypeWithBinding)))) + .bind("sizeof-pointer"), this); } @@ -292,11 +306,17 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { diag(E->getBeginLoc(), "suspicious usage of 'sizeof(char*)'; do you mean 'strlen'?") << E->getSourceRange(); - } else if (const auto *E = - Result.Nodes.getNodeAs("sizeof-pointer-to-aggregate")) { - diag(E->getBeginLoc(), - "suspicious usage of 'sizeof(A*)'; pointer to aggregate") - << E->getSourceRange(); + } else if (const auto *E = Result.Nodes.getNodeAs("sizeof-pointer")) { + if (Result.Nodes.getNodeAs("struct-type")) { + diag(E->getBeginLoc(), + "suspicious usage of 'sizeof(A*)' on pointer-to-aggregate type; did " + "you mean 'sizeof(A)'?") + << E->getSourceRange(); + } else { + 
diag(E->getBeginLoc(), "suspicious usage of 'sizeof()' on an expression " + "that results in a pointer") + << E->getSourceRange(); + } } else if (const auto *E = Result.Nodes.getNodeAs( "sizeof-compare-constant")) { diag(E->getOperatorLoc(), @@ -332,18 +352,23 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) { " numerator is not a multiple of denominator") << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); } else if (NumTy && DenomTy && NumTy == DenomTy) { + // FIXME: This message is wrong, it should not refer to sizeof "pointer" + // usage (and by the way, it would be to clarify all the messages). diag(E->getOperatorLoc(), "suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)'") << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); - } else if (PointedTy && DenomTy && PointedTy == DenomTy) { - diag(E->getOperatorLoc(), - "suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)'") - << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); - } else if (NumTy && DenomTy && NumTy->isPointerType() && - DenomTy->isPointerType()) { - diag(E->getOperatorLoc(), - "suspicious usage of sizeof pointer 'sizeof(P*)/sizeof(Q*)'") - << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); + } else if (!WarnOnSizeOfPointer) { + // When 'WarnOnSizeOfPointer' is enabled, these messages become redundant: + if (PointedTy && DenomTy && PointedTy == DenomTy) { + diag(E->getOperatorLoc(), + "suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)'") + << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); + } else if (NumTy && DenomTy && NumTy->isPointerType() && + DenomTy->isPointerType()) { + diag(E->getOperatorLoc(), + "suspicious usage of sizeof pointer 'sizeof(P*)/sizeof(Q*)'") + << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange(); + } } } else if (const auto *E = Result.Nodes.getNodeAs("sizeof-sizeof-expr")) { diff --git 
a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h index 55becdd4ecdba1..9ca17bc9e6f124 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h @@ -30,6 +30,7 @@ class SizeofExpressionCheck : public ClangTidyCheck { const bool WarnOnSizeOfThis; const bool WarnOnSizeOfCompareToConstant; const bool WarnOnSizeOfPointerToAggregate; + const bool WarnOnSizeOfPointer; }; } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 0c0c10605a8307..2dc39d0ad74af8 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -237,6 +237,12 @@ Changes in existing checks ` check by eliminating false positives resulting from use of optionals in unevaluated context. +- Improved :doc:`bugprone-sizeof-expression + ` check by eliminating some + false positives and adding a new (off-by-default) option + `WarnOnSizeOfPointer` that reports all ``sizeof(pointer)`` expressions + (except for a few that are idiomatic). + - Improved :doc:`bugprone-suspicious-include ` check by replacing the local options `HeaderFileExtensions` and `ImplementationFileExtensions` by the diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst index c37df1706eb4e1..ed5bb4fbb89baf 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst @@ -190,6 +190,15 @@ Options .. option:: WarnOnSizeOfPointerToAggregate - When `true`, the check will warn on an expression like - ``sizeof(expr)`` where the expression is a pointer - to aggregate. Default is `true`. 
+ When `true`, the check will warn when the argument of ``sizeof`` is either a + pointer-to-aggregate type, an expression returning a pointer-to-aggregate + value or an expression that returns a pointer from an array-to-pointer + conversion (that may be implicit or explicit, for example ``array + 2`` or + ``(int *)array``). Default is `true`. + +.. option:: WarnOnSizeOfPointer + + When `true`, the check will report all expressions where the argument of + ``sizeof`` is an expression that produces a pointer (except for a few + idiomatic expressions that are probably intentional and correct). + This detects occurrences of CWE 467. Default is `false`. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c index 8c4feb8f86169b..aef930f2c8fda7 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c @@ -34,24 +34,24 @@ int Test5() { int sum = 0; sum += sizeof(&S); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(__typeof(&S)); sum += sizeof(&TS); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(__typeof(&TS)); sum += sizeof(STRKWD MyStruct*); sum += sizeof(__typeof(STRKWD MyStruct*)); sum += sizeof(TypedefStruct*); sum += sizeof(__typeof(TypedefStruct*)); sum += sizeof(PTTS); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a 
pointer sum += sizeof(PMyStruct); sum += sizeof(PS); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(PS2); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(&A10); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer #ifdef __cplusplus MyStruct &rS = S; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-any-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-any-pointer.cpp new file mode 100644 index 00000000000000..bfb2ec3a9eb02c --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-any-pointer.cpp @@ -0,0 +1,241 @@ +// RUN: %check_clang_tidy %s bugprone-sizeof-expression %t -- -config="{CheckOptions: {bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression: true, bugprone-sizeof-expression.WarnOnSizeOfPointer: true}}" -- + +class C { + int size() { return sizeof(this); } + // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(this)' +}; + +#define LEN 8 + +int X; +extern int A[10]; +extern short B[10]; + +#pragma pack(1) +struct S { char a, b, c; }; + +enum E { E_VALUE = 0 }; +enum class EC { VALUE = 0 }; + +bool AsBool() { return false; } +int AsInt() { return 0; } +E AsEnum() { return E_VALUE; } +EC AsEnumClass() { return EC::VALUE; } +S AsStruct() { return {}; } + +struct M { + int AsInt() { return 0; } + E AsEnum() { return E_VALUE; } + S AsStruct() { return {}; } +}; + +int Test1(const char* ptr) { + int 
sum = 0; + sum += sizeof(LEN); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(K)' + sum += sizeof(LEN + 1); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(K)' + sum += sizeof(sum, LEN); + // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: suspicious usage of 'sizeof(..., ...)' + sum += sizeof(AsBool()); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer + sum += sizeof(AsInt()); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer + sum += sizeof(AsEnum()); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer + sum += sizeof(AsEnumClass()); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer + sum += sizeof(M{}.AsInt()); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer + sum += sizeof(M{}.AsEnum()); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer + sum += sizeof(sizeof(X)); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))' + sum += sizeof(LEN + sizeof(X)); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))' + sum += sizeof(LEN + LEN + sizeof(X)); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))' + sum += sizeof(LEN + (LEN + sizeof(X))); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))' + sum += sizeof(LEN + -sizeof(X)); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))' + sum += sizeof(LEN + - + -sizeof(X)); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))' + sum += 
sizeof(char) / sizeof(char); + // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)' + sum += sizeof(A) / sizeof(S); + // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator + sum += sizeof(char) / sizeof(int); + // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator + sum += sizeof(char) / sizeof(A); + // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator + sum += sizeof(B[0]) / sizeof(A); + // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator + sum += sizeof(ptr) / sizeof(char); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(ptr) / sizeof(ptr[0]); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(ptr) / sizeof(char*); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(ptr) / sizeof(void*); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(ptr) / sizeof(const void volatile*); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(ptr) / sizeof(char); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(int) * sizeof(char); + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication + sum += sizeof(ptr) * sizeof(ptr[0]); + // CHECK-MESSAGES: :[[@LINE-1]]:10: 
warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + // CHECK-MESSAGES: :[[@LINE-2]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication + sum += sizeof(int) * (2 * sizeof(char)); + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication + sum += (2 * sizeof(char)) * sizeof(int); + // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious 'sizeof' by 'sizeof' multiplication + if (sizeof(A) < 0x100000) sum += 42; + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: suspicious comparison of 'sizeof(expr)' to a constant + if (sizeof(A) <= 0xFFFFFFFEU) sum += 42; + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: suspicious comparison of 'sizeof(expr)' to a constant + return sum; +} + +int Test5() { + typedef int Array10[10]; + typedef C ArrayC[10]; + + struct MyStruct { + Array10 arr; + Array10* ptr; + }; + typedef const MyStruct TMyStruct; + typedef const MyStruct *PMyStruct; + typedef TMyStruct *PMyStruct2; + + static TMyStruct kGlocalMyStruct = {}; + static TMyStruct volatile * kGlocalMyStructPtr = &kGlocalMyStruct; + + MyStruct S; + PMyStruct PS; + PMyStruct2 PS2; + Array10 A10; + C *PtrArray[10]; + C *PC; + + char *PChar; + int *PInt, **PPInt; + MyStruct **PPMyStruct; + + int sum = 0; + sum += sizeof(&S.arr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(&kGlocalMyStruct.arr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(&kGlocalMyStructPtr->arr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(S.arr + 0); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(+ S.arr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an 
expression that results in a pointer + sum += sizeof((int*)S.arr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + + sum += sizeof(S.ptr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(kGlocalMyStruct.ptr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(kGlocalMyStructPtr->ptr); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + + sum += sizeof(&kGlocalMyStruct); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(&S); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(MyStruct*); + sum += sizeof(PMyStruct); + sum += sizeof(PS); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(PS2); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(&A10); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(PtrArray) / sizeof(PtrArray[1]); + // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(A10) / sizeof(PtrArray[0]); + sum += sizeof(PC) / sizeof(PtrArray[0]); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + // CHECK-MESSAGES: :[[@LINE-2]]:21: warning: suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)' + sum += sizeof(ArrayC) / sizeof(PtrArray[0]); + // CHECK-MESSAGES: 
:[[@LINE-1]]:25: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator + + sum += sizeof(PChar); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(PInt); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(PPInt); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(PPMyStruct); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + + return sum; +} + +void some_generic_function(const void *arg, int argsize); +int *IntP, **IntPP; +C *ClassP, **ClassPP; + +void GenericFunctionTest() { + // The `sizeof(pointer)` checks ignore situations where the pointer is + // produced by dereferencing a pointer-to-pointer, because this is unlikely + // to be an accident and can appear in legitimate code that tries to call + // a generic function which emulates dynamic typing within C. + some_generic_function(IntPP, sizeof(*IntPP)); + some_generic_function(ClassPP, sizeof(*ClassPP)); + // Using `...[0]` instead of the dereference operator is another common + // variant, which is also widespread in the idiomatic array-size calculation: + // `sizeof(array) / sizeof(array[0])`. + some_generic_function(IntPP, sizeof(IntPP[0])); + some_generic_function(ClassPP, sizeof(ClassPP[0])); + // FIXME: There is a third common pattern where the generic function is + // called with `&Variable` and `sizeof(Variable)`. Right now these are + // reported by the `sizeof(pointer)` checks, but this causes some false + // positives, so it would be good to create an exception for them. 
+ some_generic_function(&IntPP, sizeof(IntP)); + // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + some_generic_function(&ClassPP, sizeof(ClassP)); + // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer +} + +int ValidExpressions() { + int A[] = {1, 2, 3, 4}; + static const char str[] = "hello"; + static const char* ptr[] { "aaa", "bbb", "ccc" }; + typedef C *CA10[10]; + C *PtrArray[10]; + CA10 PtrArray1; + + int sum = 0; + if (sizeof(A) < 10) + sum += sizeof(A); + sum += sizeof(int); + sum += sizeof(AsStruct()); + sum += sizeof(M{}.AsStruct()); + sum += sizeof(A[sizeof(A) / sizeof(int)]); + // Here the outer sizeof is reported, but the inner ones are accepted: + sum += sizeof(&A[sizeof(A) / sizeof(int)]); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer + sum += sizeof(sizeof(0)); // Special case: sizeof size_t. + sum += sizeof(void*); + sum += sizeof(void const *); + sum += sizeof(void const *) / 4; + sum += sizeof(str); + sum += sizeof(str) / sizeof(char); + sum += sizeof(str) / sizeof(str[0]); + sum += sizeof(ptr) / sizeof(ptr[0]); + sum += sizeof(ptr) / sizeof(*(ptr)); + sum += sizeof(PtrArray) / sizeof(PtrArray[0]); + // Canonical type of PtrArray1 is same as PtrArray. + sum = sizeof(PtrArray) / sizeof(PtrArray1[0]); + // There is no warning for 'sizeof(T*)/sizeof(Q)' case. 
+ sum += sizeof(PtrArray) / sizeof(A[0]); + return sum; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp index 003a02209c3d2d..064f31cb08c6b3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp @@ -124,8 +124,6 @@ int Test1(const char* ptr) { // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious usage of sizeof pointer 'sizeof(P*)/sizeof(Q*)' sum += sizeof(ptr) / sizeof(char); // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)' - sum += sizeof(ptr) / sizeof(ptr[0]); - // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)' sum += sizeof(int) * sizeof(char); // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication sum += sizeof(ptr) * sizeof(ptr[0]); @@ -207,50 +205,57 @@ int Test5() { C *PtrArray[10]; C *PC; + char *PChar; + int *PInt, **PPInt; + MyStruct **PPMyStruct; + int sum = 0; sum += sizeof(&S.arr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(&kGlocalMyStruct.arr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(&kGlocalMyStructPtr->arr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(S.arr + 0); - // CHECK-MESSAGES: :[[@LINE-1]]:10: 
warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(+ S.arr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof((int*)S.arr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(S.ptr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(kGlocalMyStruct.ptr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(kGlocalMyStructPtr->ptr); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(&kGlocalMyStruct); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(&S); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(MyStruct*); sum += sizeof(PMyStruct); 
sum += sizeof(PS); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(PS2); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(&A10); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(PtrArray) / sizeof(PtrArray[1]); - // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer sum += sizeof(A10) / sizeof(PtrArray[0]); - // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate sum += sizeof(PC) / sizeof(PtrArray[0]); - // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer // CHECK-MESSAGES: :[[@LINE-2]]:21: warning: suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)' - // CHECK-MESSAGES: :[[@LINE-3]]:23: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate sum += sizeof(ArrayC) / sizeof(PtrArray[0]); // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator - // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate + + // These pointers do not point to aggregate types, so they are not reported in this mode: + sum += 
sizeof(PChar); + sum += sizeof(PInt); + sum += sizeof(PPInt); + sum += sizeof(PPMyStruct); return sum; } @@ -293,6 +298,32 @@ bool Baz() { return sizeof(A) < N; } // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: suspicious comparison of 'sizeof(expr)' to a constant bool Test7() { return Baz<-1>(); } +void some_generic_function(const void *arg, int argsize); +int *IntP, **IntPP; +C *ClassP, **ClassPP; + +void GenericFunctionTest() { + // The `sizeof(pointer)` checks ignore situations where the pointer is + // produced by dereferencing a pointer-to-pointer, because this is unlikely + // to be an accident and can appear in legitimate code that tries to call + // a generic function which emulates dynamic typing within C. + some_generic_function(IntPP, sizeof(*IntPP)); + some_generic_function(ClassPP, sizeof(*ClassPP)); + // Using `...[0]` instead of the dereference operator is another common + // variant, which is also widespread in the idiomatic array-size calculation: + // `sizeof(array) / sizeof(array[0])`. + some_generic_function(IntPP, sizeof(IntPP[0])); + some_generic_function(ClassPP, sizeof(ClassPP[0])); + // FIXME: There is a third common pattern where the generic function is + // called with `&Variable` and `sizeof(Variable)`. Right now these are + // reported by the `sizeof(pointer)` checks, but this causes some false + // positives, so it would be good to create an exception for them. + // NOTE: `sizeof(IntP)` is only reported with `WarnOnSizeOfPointer=true`. 
+ some_generic_function(&IntPP, sizeof(IntP)); + some_generic_function(&ClassPP, sizeof(ClassP)); + // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer +} + int ValidExpressions() { int A[] = {1, 2, 3, 4}; static const char str[] = "hello"; From d4b8b7217f31827f8536c9340b55ecb21e540621 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Tue, 11 Jun 2024 14:27:35 +0200 Subject: [PATCH 57/82] [CodeGen][MachineLICM] Use RegUnits in HoistRegionPostRA (#94608) Those BitVectors get expensive on targets like AMDGPU with thousands of registers, and RegAliasIterator is also expensive. We can move all liveness calculations to use RegUnits instead to speed it up for targets where RegAliasIterator is expensive, like AMDGPU. On targets where RegAliasIterator is cheap, this alternative can be a little more expensive, but I believe the tradeoff is worth it. --- llvm/lib/CodeGen/MachineLICM.cpp | 143 +++++++++++++++------- llvm/test/CodeGen/AMDGPU/indirect-call.ll | 4 +- 2 files changed, 100 insertions(+), 47 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 86eb259c090152..9cc6d9b9fa715f 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -223,8 +223,8 @@ namespace { void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop, MachineBasicBlock *CurPreheader); - void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, - BitVector &PhysRegClobbers, SmallSet &StoredFIs, + void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers, + SmallSet &StoredFIs, SmallVectorImpl &Candidates, MachineLoop *CurLoop); @@ -423,10 +423,47 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { return false; } +static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI, + BitVector &RUs, + const uint32_t *Mask) { + // Iterate over the RegMask raw to avoid constructing a BitVector, 
which is + // expensive as it implies dynamically allocating memory. + // + // We also work backwards. + const unsigned NumRegs = TRI.getNumRegs(); + const unsigned MaskWords = (NumRegs + 31) / 32; + for (unsigned K = 0; K < MaskWords; ++K) { + // We want to set the bits that aren't in RegMask, so flip it. + uint32_t Word = ~Mask[K]; + + // Iterate all set bits, starting from the right. + while (Word) { + const unsigned SetBitIdx = countr_zero(Word); + + // The bits are numbered from the LSB in each word. + const unsigned PhysReg = (K * 32) + SetBitIdx; + + // Clear the bit at SetBitIdx. Doing it this way appears to generate less + // instructions on x86. This works because negating a number will flip all + // the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but + // faster. + Word ^= Word & -Word; + + if (PhysReg == NumRegs) + return; + + if (PhysReg) { + for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI) + RUs.set(*RUI); + } + } + } +} + /// Examine the instruction for potentai LICM candidate. Also /// gather register def and frame object update information. -void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, - BitVector &PhysRegClobbers, +void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs, + BitVector &RUClobbers, SmallSet &StoredFIs, SmallVectorImpl &Candidates, MachineLoop *CurLoop) { @@ -448,7 +485,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, // We can't hoist an instruction defining a physreg that is clobbered in // the loop. 
if (MO.isRegMask()) { - PhysRegClobbers.setBitsNotInMask(MO.getRegMask()); + applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask()); continue; } @@ -460,16 +497,22 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, assert(Reg.isPhysical() && "Not expecting virtual register!"); if (!MO.isDef()) { - if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg))) - // If it's using a non-loop-invariant register, then it's obviously not - // safe to hoist. - HasNonInvariantUse = true; + if (!HasNonInvariantUse) { + for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { + // If it's using a non-loop-invariant register, then it's obviously + // not safe to hoist. + if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) { + HasNonInvariantUse = true; + break; + } + } + } continue; } if (MO.isImplicit()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - PhysRegClobbers.set(*AI); + for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) + RUClobbers.set(*RUI); if (!MO.isDead()) // Non-dead implicit def? This cannot be hoisted. RuledOut = true; @@ -488,19 +531,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, // If we have already seen another instruction that defines the same // register, then this is not safe. Two defs is indicated by setting a // PhysRegClobbers bit. - for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) { - if (PhysRegDefs.test(*AS)) - PhysRegClobbers.set(*AS); + for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { + if (RUDefs.test(*RUI)) { + RUClobbers.set(*RUI); + RuledOut = true; + } else if (RUClobbers.test(*RUI)) { + // MI defined register is seen defined by another instruction in + // the loop, it cannot be a LICM candidate. + RuledOut = true; + } + + RUDefs.set(*RUI); } - // Need a second loop because MCRegAliasIterator can visit the same - // register twice. 
- for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) - PhysRegDefs.set(*AS); - - if (PhysRegClobbers.test(Reg)) - // MI defined register is seen defined by another instruction in - // the loop, it cannot be a LICM candidate. - RuledOut = true; } // Only consider reloads for now and remats which do not have register @@ -521,9 +563,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop, if (!Preheader) return; - unsigned NumRegs = TRI->getNumRegs(); - BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop. - BitVector PhysRegClobbers(NumRegs); // Regs defined more than once. + unsigned NumRegUnits = TRI->getNumRegUnits(); + BitVector RUDefs(NumRegUnits); // RUs defined once in the loop. + BitVector RUClobbers(NumRegUnits); // RUs defined more than once. SmallVector Candidates; SmallSet StoredFIs; @@ -540,22 +582,21 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop, // FIXME: That means a reload that're reused in successor block(s) will not // be LICM'ed. for (const auto &LI : BB->liveins()) { - for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) - PhysRegDefs.set(*AI); + for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI) + RUDefs.set(*RUI); } // Funclet entry blocks will clobber all registers if (const uint32_t *Mask = BB->getBeginClobberMask(TRI)) - PhysRegClobbers.setBitsNotInMask(Mask); + applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask); SpeculationState = SpeculateUnknown; for (MachineInstr &MI : *BB) - ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates, - CurLoop); + ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop); } // Gather the registers read / clobbered by the terminator. 
- BitVector TermRegs(NumRegs); + BitVector TermRUs(NumRegUnits); MachineBasicBlock::iterator TI = Preheader->getFirstTerminator(); if (TI != Preheader->end()) { for (const MachineOperand &MO : TI->operands()) { @@ -564,8 +605,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop, Register Reg = MO.getReg(); if (!Reg) continue; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - TermRegs.set(*AI); + for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) + TermRUs.set(*RUI); } } @@ -583,24 +624,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop, continue; unsigned Def = Candidate.Def; - if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) { - bool Safe = true; - MachineInstr *MI = Candidate.MI; - for (const MachineOperand &MO : MI->all_uses()) { - if (!MO.getReg()) - continue; - Register Reg = MO.getReg(); - if (PhysRegDefs.test(Reg) || - PhysRegClobbers.test(Reg)) { + bool Safe = true; + for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) { + if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) { + Safe = false; + break; + } + } + + if (!Safe) + continue; + + MachineInstr *MI = Candidate.MI; + for (const MachineOperand &MO : MI->all_uses()) { + if (!MO.getReg()) + continue; + for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) { + if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) { // If it's using a non-loop-invariant register, then it's obviously // not safe to hoist. 
Safe = false; break; } } - if (Safe) - HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader); + + if (!Safe) + break; } + + if (Safe) + HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader); } } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 7799b9509ceb03..da8aa544698355 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_writelane_b32 v40, s62, 30 ; GCN-NEXT: v_writelane_b32 v40, s63, 31 ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: v_readfirstlane_b32 s9, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: s_xor_b64 exec, exec, s[10:11] @@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_writelane_b32 v40, s62, 30 ; GISEL-NEXT: v_writelane_b32 v40, s63, 31 ; GISEL-NEXT: s_mov_b64 s[6:7], exec -; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s8, v0 ; GISEL-NEXT: v_readfirstlane_b32 s9, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] ; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GISEL-NEXT: s_movk_i32 s4, 0x7b ; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GISEL-NEXT: ; implicit-def: $vgpr0 ; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11] From ffc3a6b286ee619ab8f662cb7174705734eb1ce1 Mon Sep 17 00:00:00 2001 From: Zibi Sarbinowski Date: Tue, 11 Jun 2024 08:29:12 -0400 Subject: [PATCH 58/82] [libc++] Fix endianness for algorithm mismatch (#93082) This PR is required to fix 
`std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp` test for big endian platforms such as z/OS. --- libcxx/include/__algorithm/simd_utils.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index aa4336a2214c87..549197be80183f 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -11,6 +11,7 @@ #include <__algorithm/min.h> #include <__bit/bit_cast.h> +#include <__bit/countl.h> #include <__bit/countr.h> #include <__config> #include <__type_traits/is_arithmetic.h> @@ -126,8 +127,13 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_T // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876 auto __impl = [&](_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept { +# if defined(_LIBCPP_BIG_ENDIAN) + return std::min( + _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)))); +# else return std::min( _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec)))); +# endif }; if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) { From 8c5d9c79b96ed8297b381e00d3a706a432cd6c9d Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 11 Jun 2024 13:29:38 +0100 Subject: [PATCH 59/82] Revert "[MLIR][Flang][DebugInfo] Set debug info format in MLIR->IR translation (#95098)" Reverted due to failure on buildbot due to missing use of the WriteNewDbgInfoFormat flag in MLIR. This reverts commit ca920bb6285e9995f5a202d040af79363e98ab28. 
--- flang/lib/Frontend/FrontendActions.cpp | 9 --------- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 10 ---------- 2 files changed, 19 deletions(-) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index a4db944e8c0abd..b1b6391f1439c6 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -50,7 +50,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" -#include "llvm/IR/DebugProgramInstruction.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" @@ -82,8 +81,6 @@ using namespace Fortran::frontend; llvm::PassPluginLibraryInfo get##Ext##PluginInfo(); #include "llvm/Support/Extension.def" -extern llvm::cl::opt WriteNewDbgInfoFormat; - /// Save the given \c mlirModule to a temporary .mlir file, in a location /// decided by the -save-temps flag. No files are produced if the flag is not /// specified. @@ -1274,12 +1271,6 @@ void CodeGenAction::executeAction() { runOptimizationPipeline(ci.isOutputStreamNull() ? *os : ci.getOutputStream()); if (action == BackendActionTy::Backend_EmitLL) { - // When printing LLVM IR, we should convert the module to the debug info - // format that LLVM expects us to print. - llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, - WriteNewDbgInfoFormat); - if (WriteNewDbgInfoFormat) - llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(ci.isOutputStreamNull() ? 
*os : ci.getOutputStream(), /*AssemblyAnnotationWriter=*/nullptr); return; diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index e1a60f195fe89c..7b86b250c294b4 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -64,8 +64,6 @@ using namespace mlir; using namespace mlir::LLVM; using namespace mlir::LLVM::detail; -extern llvm::cl::opt UseNewDbgInfoFormat; - #include "mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc" namespace { @@ -1791,9 +1789,6 @@ prepareLLVMModule(Operation *m, llvm::LLVMContext &llvmContext, StringRef name) { m->getContext()->getOrLoadDialect(); auto llvmModule = std::make_unique(name, llvmContext); - // ModuleTranslation can currently only construct modules in the old debug - // info format, so set the flag accordingly. - llvmModule->setNewDbgInfoFormatFlag(false); if (auto dataLayoutAttr = m->getDiscardableAttr(LLVM::LLVMDialect::getDataLayoutAttrName())) { llvmModule->setDataLayout(cast(dataLayoutAttr).getValue()); @@ -1872,11 +1867,6 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, if (failed(translator.convertFunctions())) return nullptr; - // Once we've finished constructing elements in the module, we should convert - // it to use the debug info format desired by LLVM. - // See https://llvm.org/docs/RemoveDIsDebugInfo.html - translator.llvmModule->setIsNewDbgInfoFormat(UseNewDbgInfoFormat); - if (!disableVerification && llvm::verifyModule(*translator.llvmModule, &llvm::errs())) return nullptr; From a45080f09181517c9c5eb5099a6b6ac67a48424a Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Tue, 11 Jun 2024 14:31:26 +0200 Subject: [PATCH 60/82] [AMDGPU] Document amdgpu-as in AMDGPUUsage (#94335) Add a section about fence & address spaces that covers amdgpu-as. 
--- llvm/docs/AMDGPUUsage.rst | 411 ++++++++++---------------------------- 1 file changed, 105 insertions(+), 306 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index aa50ce329d1dea..b7ec1b51ee247e 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5980,6 +5980,33 @@ following sections: * :ref:`amdgpu-amdhsa-memory-model-gfx942` * :ref:`amdgpu-amdhsa-memory-model-gfx10-gfx11` +.. _amdgpu-fence-as: + +Fence and Address Spaces +++++++++++++++++++++++++++++++ + +LLVM fences do not have address space information, thus, fence +codegen usually needs to conservatively synchronize all address spaces. + +In the case of OpenCL, where fences only need to synchronize +user-specified address spaces, this can result in extra unnecessary waits. +For instance, a fence that is supposed to only synchronize local memory will +also have to wait on all global memory operations, which is unnecessary. + +:doc:`Memory Model Relaxation Annotations ` can +be used as an optimization hint for fences to solve this problem. +The AMDGPU backend recognizes the following tags on fences: + +- ``amdgpu-as:local`` - fence only the local address space +- ``amdgpu-as:global``- fence only the global address space + +.. note:: + + As an optimization hint, those tags are not guaranteed to survive until + code generation. Optimizations are free to drop the tags to allow for + better code optimization, at the cost of synchronizing additional address + spaces. + .. _amdgpu-amdhsa-memory-model-gfx6-gfx9: Memory Model GFX6-GFX9 @@ -6317,21 +6344,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`. - If OpenCL and address space is not generic, omit. - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. 
+ - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Must happen after any preceding local/generic load @@ -6363,14 +6378,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`. address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -6573,21 +6583,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`. - If OpenCL and address space is not generic, omit. - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Must happen after any preceding local/generic @@ -6623,21 +6621,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`. address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -6967,14 +6953,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`. address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). 
+ - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -7915,21 +7896,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - s_waitcnt vmcnt(0) must happen after any preceding @@ -7988,14 +7957,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -8066,14 +8030,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -8441,21 +8400,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. 
+ - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - s_waitcnt vmcnt(0) must happen after any preceding @@ -8501,21 +8448,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -8583,21 +8518,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -9218,14 +9141,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -9327,14 +9245,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). 
+ - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -10290,21 +10203,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - s_waitcnt vmcnt(0) must happen after any preceding @@ -10363,14 +10264,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -10441,14 +10337,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -10847,21 +10738,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. 
If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - s_waitcnt vmcnt(0) must happen after any preceding @@ -10920,21 +10799,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -10999,21 +10866,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is local, omit vmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -11662,14 +11517,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. 
- Could be split into separate s_waitcnt vmcnt(0) and @@ -11771,14 +11621,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 address space is not generic, omit lgkmcnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0) and @@ -12624,21 +12469,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`. address space is local, omit vmcnt(0) and vscnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0), s_waitcnt @@ -12721,14 +12554,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`. address space is local, omit vmcnt(0) and vscnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0), s_waitcnt @@ -13092,21 +12920,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`. address space is local, omit vmcnt(0) and vscnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. 
+ - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0), s_waitcnt @@ -13165,21 +12981,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`. address space is local, omit vmcnt(0) and vscnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate. If - fence had an - address space then - set to address - space of OpenCL - fence flag, or to - generic if both - local and global - flags are - specified. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0), s_waitcnt @@ -13731,14 +13535,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`. address space is local, omit vmcnt(0) and vscnt(0). - - However, since LLVM - currently has no - address space on - the fence need to - conservatively - always generate - (see comment for - previous fence). + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. - Could be split into separate s_waitcnt vmcnt(0), s_waitcnt From a2bc50aa8b7192986802e9568a1ed71a894e16e2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 7 Jun 2024 20:22:39 +0200 Subject: [PATCH 61/82] AMDGPU: Add more tests for vector typed atomicrmw fadd Some cases should be legal for gfx940. 
--- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 161 + .../GlobalISel/irtranslator-atomicrmw.ll | 162 +- .../CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll | 102 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 5084 ++++++++++++++ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 199 + llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 5950 ++++++++++++++++- llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 2466 +++++-- .../AMDGPU/expand-atomic-rmw-fadd.ll | 187 +- 8 files changed, 13542 insertions(+), 769 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index fab94875516973..93c30a6a01e002 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -209,4 +209,165 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ret <2 x i16> %ret } +define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { +; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr 
addrspace(3) %ptr, i32 16383 + %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define void @local_atomic_fadd_noret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) { +; GFX940-LABEL: local_atomic_fadd_noret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v2, v0 offset:65532 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383 + %unused = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst + ret void +} + +define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) { +; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: 
.LBB19_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i32 256 + %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) { +; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i32 256 + %unused = atomicrmw fadd ptr %gep, <2 x 
half> %val syncscope("agent") seq_cst + ret void +} + attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll index 8262cfd34823ff..5724cf471bae36 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -O0 -stop-after=irtranslator -o - %s | FileCheck %s define float @test_atomicrmw_fadd(ptr addrspace(3) %addr) { ; CHECK-LABEL: name: test_atomicrmw_fadd @@ -34,14 +34,14 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) { ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %13(s32), %bb.2 ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]] ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) - ; CHECK-NEXT: [[INTRINSIC:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) - ; CHECK-NEXT: [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INTRINSIC]](s64) - ; CHECK-NEXT: G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.3 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) + ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: 
bb.3.atomicrmw.end: ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32), %bb.2 - ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INTRINSIC]](s64), %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 @@ -49,5 +49,157 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) { ret float %oldval } +define <2 x half> @test_atomicrmw_fadd_vector(ptr addrspace(3) %addr) { + ; CHECK-LABEL: name: test_atomicrmw_fadd_vector + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3) + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.atomicrmw.start: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2 + ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[PHI1]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FADD]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) + ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 
3) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) + ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.atomicrmw.end: + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](<2 x s16>) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %oldval = atomicrmw fadd ptr addrspace(3) %addr, <2 x half> seq_cst + ret <2 x half> %oldval +} + +define <2 x half> @test_atomicrmw_fsub_vector(ptr addrspace(3) %addr) { + ; CHECK-LABEL: name: test_atomicrmw_fsub_vector + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3) + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.atomicrmw.start: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2 + ; CHECK-NEXT: [[FSUB:%[0-9]+]]:_(<2 x s16>) = G_FSUB [[PHI1]], 
[[BUILD_VECTOR]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FSUB]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) + ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) + ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.atomicrmw.end: + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](<2 x s16>) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %oldval = atomicrmw fsub ptr addrspace(3) %addr, <2 x half> <half 1.0, half 1.0> seq_cst + ret <2 x half> %oldval +} + +define <2 x half> @test_atomicrmw_fmin_vector(ptr addrspace(3) %addr) { + ; CHECK-LABEL: name: test_atomicrmw_fmin_vector + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3) + ; CHECK-NEXT: G_BR %bb.2 + ;
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.atomicrmw.start: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2 + ; CHECK-NEXT: [[FMINNUM:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM [[PHI1]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) + ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) + ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.atomicrmw.end: + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](<2 x s16>) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %oldval = atomicrmw fmin ptr addrspace(3) %addr, <2 x half> <half 1.0, half 1.0> seq_cst + ret <2 x half> %oldval +} + +define <2 x half> @test_atomicrmw_fmax_vector(ptr addrspace(3) %addr) { + ; CHECK-LABEL: name: test_atomicrmw_fmax_vector + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ;
CHECK-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3) + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.atomicrmw.start: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2 + ; CHECK-NEXT: [[FMAXNUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM [[PHI1]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM]](<2 x s16>) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>) + ; CHECK-NEXT: [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3) + ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32) + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64) + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64) + ; CHECK-NEXT: G_BRCOND [[INT1]](s1), %bb.3 + ; CHECK-NEXT: G_BR %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.atomicrmw.end: + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[PHI2]](<2 x 
s16>) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %oldval = atomicrmw fmax ptr addrspace(3) %addr, <2 x half> <half 1.0, half 1.0> seq_cst + ret <2 x half> %oldval +} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll index 376fe79f542e36..647c5b568b7ad5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX12 %s define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) { ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic @@ -13,6 +14,18 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x ha ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; GFX940-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX12-NEXT: FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr ::
(volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX12-NEXT: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) ret void } @@ -30,84 +43,21 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x ; GFX940-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; + ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic + ; GFX12: bb.0 (%ir-block.0): + ; GFX12-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX12-NEXT: [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; GFX12-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data) ret <2 x half> %ret } declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>) - -define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { - ; GFX940-LABEL: name: flat_agent_atomic_fadd_ret_v2f16 - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: successors: %bb.1(0x80000000) - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; 
GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) - ; GFX940-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.1.atomicrmw.start: - ; GFX940-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 - ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 - ; GFX940-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] - ; GFX940-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr) - ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec - ; GFX940-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc - ; GFX940-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.2.atomicrmw.end: - ; GFX940-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI 
[[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.1 - ; GFX940-NEXT: [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 - ; GFX940-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: $vgpr0 = COPY [[PHI2]] - ; GFX940-NEXT: SI_RETURN implicit $vgpr0 - %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst - ret <2 x half> %result -} - -define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { - ; GFX940-LABEL: name: flat_agent_atomic_fadd_noret_v2f16 - ; GFX940: bb.0 (%ir-block.0): - ; GFX940-NEXT: successors: %bb.1(0x80000000) - ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] - ; GFX940-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) - ; GFX940-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.1.atomicrmw.start: - ; GFX940-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1 - ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 - ; GFX940-NEXT: [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1 - ; GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY 
[[REG_SEQUENCE1]] - ; GFX940-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr) - ; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec - ; GFX940-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc - ; GFX940-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: S_BRANCH %bb.2 - ; GFX940-NEXT: {{ $}} - ; GFX940-NEXT: bb.2.atomicrmw.end: - ; GFX940-NEXT: [[PHI2:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1 - ; GFX940-NEXT: SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX940-NEXT: SI_RETURN - %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll new file mode 100644 index 00000000000000..1e7cf0e702a03f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -0,0 +1,5084 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX90,GFX908 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90,GFX90A %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck 
-check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB0_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB0_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB0_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: s_mov_b32 s0, 0 +; 
GFX12-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: 
v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB1_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: 
v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB1_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB1_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, 
s0 +; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB2_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap 
v3, v[0:1], v[5:6] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB2_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 +; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB2_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB2_1: ; 
%atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB2_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; 
GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX90-LABEL: flat_agent_atomic_fadd_noret_v2bf16: +; GFX90: ; %bb.0: +; GFX90-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90-NEXT: flat_load_dword v3, v[0:1] +; GFX90-NEXT: s_mov_b64 s[6:7], 0 +; GFX90-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90-NEXT: s_movk_i32 s8, 0x7fff +; GFX90-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90-NEXT: .LBB3_1: ; %atomicrmw.start +; 
GFX90-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90-NEXT: buffer_wbinvl1_vol +; GFX90-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90-NEXT: v_mov_b32_e32 v3, v2 +; GFX90-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90-NEXT: s_cbranch_execnz .LBB3_1 +; GFX90-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; 
GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB3_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; 
GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB3_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + +define <2 x half> @flat_system_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 
v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB4_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_system_atomic_fadd_ret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define void @flat_system_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: 
v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dword v4, v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v5, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v5, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB5_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB5_1 +; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_system_atomic_fadd_noret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x half> %val seq_cst + ret void +} + +define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v4 +; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX8: ; 
%bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; 
GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 
0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 
v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, 
v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_system_atomic_fadd_ret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) 
| instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val seq_cst + ret <2 x bfloat> %result +} + +define void @flat_system_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_system_atomic_fadd_noret_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; 
GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_system_atomic_fadd_noret_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; 
GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dword v3, v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 
v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
flat_system_atomic_fadd_noret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v3, v[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: 
s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB7_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b32 v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB7_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_system_atomic_fadd_noret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val seq_cst + ret void +} + +define <4 x half> @flat_agent_atomic_fadd_ret_v4f16(ptr %ptr, <4 x half> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; GFX7-NEXT: flat_load_dword v9, v[0:1] +; GFX7-NEXT: v_addc_u32_e32 v7, 
vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v10, v[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-NEXT: v_add_f32_e32 v10, v10, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[10:11], v[0:1], v[2:5] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_add_f16_sdwa v4, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v7, v3 +; GFX8-NEXT: v_add_f16_sdwa v8, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v9, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v9, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX908: ; 
%bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX908-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX940-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX10-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: 
s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX11-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_ret_v4f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX12-NEXT: 
v_pk_add_f16 v4, v6, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <4 x half> %val syncscope("agent") seq_cst + ret <4 x half> %result +} + +define <4 x half> @flat_system_atomic_fadd_ret_v4f16(ptr %ptr, <4 x half> %val) { +; GFX7-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 4, v0 +; GFX7-NEXT: flat_load_dword v9, v[0:1] +; GFX7-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v10, v[6:7] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v9, v11 +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v4 +; GFX7-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v2, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v6 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX7-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GFX7-NEXT: v_add_f32_e32 v10, v10, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v9 +; GFX7-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GFX7-NEXT: v_cvt_f16_f32_e32 v12, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX7-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[10:11], v[0:1], v[2:5] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v10 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_add_f16_sdwa v4, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v7, v3 +; GFX8-NEXT: v_add_f16_sdwa v8, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v9, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 +; GFX8-NEXT: v_or_b32_e32 v4, v9, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v7, v5 +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX908-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: 
v_mov_b32_e32 v1, v5 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX940-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz 
.LBB9_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX10-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX11-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_system_atomic_fadd_ret_v4f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_add_f16 v5, v7, v3 +; GFX12-NEXT: v_pk_add_f16 v4, v6, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr %ptr, <4 x half> %val seq_cst + ret <4 x half> %result +} + +define <2 x half> @flat_agent_atomic_fadd_ret_v2f16_offset(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: 
flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x3ffc, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x3ffc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB10_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x3ffc, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: flat_load_dword v0, v[0:1] offset:4092 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x3ffc, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:4092 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX940-NEXT: flat_load_dword v0, v[0:1] offset:4092 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x3ffc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: flat_load_dword v0, v[3:4] +; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB10_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x3000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x3ffc, v3 +; GFX11-NEXT: flat_load_b32 v0, v[4:5] offset:4092 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB10_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:16380 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 4095 + %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define <2 x half> 
@flat_agent_atomic_fadd_ret_v2f16_negoffset(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[4:5] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v1, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB11_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: flat_load_dword v0, v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX940-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: flat_load_dword v0, v[3:4] +; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB11_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start +; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB11_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, 
ptr %ptr, i64 -512 + %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result +} + +define void @flat_agent_atomic_fadd_noret_v2f16_offset(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3ffc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ffc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB12_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x3ffc, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: flat_load_dword v1, v[0:1] offset:4092 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB12_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x3ffc, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:4092 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0x3000, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX940-NEXT: flat_load_dword v5, v[4:5] offset:4092 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, 
v[0:1], v[4:5] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3ffc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB12_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x3000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3ffc, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v4, v[3:4] offset:4092 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB12_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:16380 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 4095 + %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define void 
@flat_agent_atomic_fadd_noret_v2f16_negoffset(ptr %ptr, <2 x half> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 +; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 +; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB13_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 
0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v5, v4, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB13_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: flat_load_dword v1, v[0:1] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB13_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX940-NEXT: flat_load_dword v5, v[4:5] +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB13_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v4, v[3:4] +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_offset(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: 
+; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x3ffc, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x3ffc, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: 
.LBB14_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB14_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x3ffc, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: flat_load_dword v0, v[0:1] offset:4092 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; 
GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x3ffc, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:4092 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v4 +; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX940-NEXT: flat_load_dword v0, v[0:1] offset:4092 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x3ffc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: flat_load_dword v0, v[3:4] +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: 
v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x3000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x3ffc, v3 +; GFX11-NEXT: flat_load_b32 v0, v[4:5] offset:4092 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:16380 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, 
v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 4095 + %result = atomicrmw fadd ptr %gep, <2 x 
bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_negoffset(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 
0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB15_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX908-NEXT: 
flat_load_dword v0, v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v0 +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v0 +; GFX940-NEXT: v_mov_b32_e32 v5, v1 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc +; GFX940-NEXT: flat_load_dword v0, v[0:1] +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 
s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 +; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: flat_load_dword v0, v[3:4] +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 +; GFX11-NEXT: flat_load_b32 v0, v[4:5] +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; 
GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: 
s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define void @flat_agent_atomic_fadd_noret_v2bf16_offset(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x3ffc, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB16_1 +; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ffc, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB16_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x3ffc, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX908-NEXT: flat_load_dword v1, v[0:1] offset:4092 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x3ffc, v0 +; GFX90A-NEXT: 
v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:4092 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0x3000, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v5, 
vcc, 0, v1, vcc +; GFX940-NEXT: flat_load_dword v3, v[4:5] offset:4092 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3ffc, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x3000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3ffc, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v3, v[3:4] offset:4092 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; 
GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:16380 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, 
ptr %ptr, i64 4095 + %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + +define void @flat_agent_atomic_fadd_noret_v2bf16_negoffset(ptr %ptr, <2 x bfloat> %val) { +; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX7-NEXT: flat_load_dword v5, v[0:1] +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v3 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: s_mov_b64 s[6:7], 0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 +; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_cbranch_execnz .LBB17_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX908-NEXT: v_addc_co_u32_e32 v1, 
vcc, -1, v1, vcc +; GFX908-NEXT: flat_load_dword v1, v[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX908-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX90A-NEXT: flat_load_dword v1, v[0:1] +; GFX90A-NEXT: s_mov_b64 
s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 +; GFX940-NEXT: s_movk_i32 s0, 0xf800 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc +; GFX940-NEXT: flat_load_dword v3, v[4:5] +; GFX940-NEXT: s_mov_b32 s1, -1 +; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 
16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v3, v[3:4] +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 
v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll 
b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 50a27d42322d7a..16ddf91ebf8f02 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) @@ -20,6 +21,15 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_fadd_f32_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX12-NEXT: s_endpgm %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) ret void } @@ -36,6 +46,29 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB1_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret void } @@ -52,6 +85,29 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] +; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB2_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_endpgm %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret void } @@ -63,6 +119,17 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) { ; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) ret float %ret } @@ -77,6 +144,35 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst ret float %ret } @@ -91,6 +187,15 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_f16 
v[0:1], v2 ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) ret void } @@ -102,6 +207,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { ; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) ret <2 x half> %ret } @@ -116,6 +232,15 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) ret void } @@ -127,6 +252,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { ; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) ret <2 x i16> %ret } @@ -141,6 +277,16 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3] ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) ret void } @@ -152,6 +298,17 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> ; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_atomic_fadd_v2bf16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 
x i16> %data) ret <2 x i16> %ret } @@ -165,6 +322,14 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, ; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: local_atomic_fadd_v2f16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret void } @@ -176,6 +341,17 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret } @@ -192,6 +368,16 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ds_pk_add_bf16 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret void } @@ -205,6 +391,19 @@ define 
<2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) ret <2 x i16> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index d10e049444d685..88ce07c9779d0a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -2,8 +2,10 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr 
addrspace(1) %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -131,6 +133,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_ret_f32: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: ; implicit-def: $vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX940-NEXT: s_cbranch_execz .LBB0_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB0_2: +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX940-NEXT: v_readfirstlane_b32 s0, v1 +; GFX940-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX940-NEXT: global_store_dword v[0:1], v0, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo @@ -214,6 +246,48 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_ret_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; 
GFX12-NEXT: s_cbranch_execz .LBB0_4 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b32 s5, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX12-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX12-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_cbranch_execnz .LBB0_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: .LBB0_4: ; %Flow1 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX12-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst store float %result, ptr addrspace(1) undef ret void @@ -332,6 +406,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: ; implicit-def: $vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc 
+; GFX940-NEXT: s_cbranch_execz .LBB1_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB1_2: +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX940-NEXT: v_readfirstlane_b32 s0, v1 +; GFX940-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX940-NEXT: global_store_dword v[0:1], v0, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo @@ -402,6 +506,34 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB1_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: .LBB1_2: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v1 +; 
GFX12-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX12-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst store float %result, ptr addrspace(1) undef ret void @@ -483,6 +615,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX90A-NEXT: .LBB2_2: ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_noret_f32: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB2_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB2_2: +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_noret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s3, exec_lo @@ -535,6 +689,25 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) # ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB2_2: ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_noret_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB2_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12-NEXT: 
v_cvt_f32_ubyte0_e32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: .LBB2_2: +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -615,6 +788,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX90A-NEXT: .LBB3_2: ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB3_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB3_2: +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s3, exec_lo @@ -667,6 +862,25 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: .LBB3_2: ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB3_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX12-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v1, v0, s[0:1] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: .LBB3_2: +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -781,6 +995,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: ; implicit-def: $vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX940-NEXT: s_cbranch_execz .LBB4_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB4_2: +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX940-NEXT: v_readfirstlane_b32 s0, v1 +; GFX940-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX940-NEXT: global_store_dword v[0:1], v0, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo @@ -850,6 +1094,34 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt ; 
GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB4_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s3 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: .LBB4_2: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX12-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst store float %result, ptr addrspace(1) undef ret void @@ -981,6 +1253,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_ret_f32_system: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: ; implicit-def: $vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX940-NEXT: s_cbranch_execz .LBB5_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; 
GFX940-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB5_2: +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX940-NEXT: v_readfirstlane_b32 s0, v1 +; GFX940-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX940-NEXT: global_store_dword v[0:1], v0, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_ret_f32_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s4, exec_lo @@ -1064,6 +1366,48 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_ret_f32_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB5_4 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s4, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b32 s5, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX12-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX12-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_cbranch_execnz .LBB5_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: .LBB5_4: ; %Flow1 +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NEXT: v_mul_f32_e32 v0, 4.0, v0 +; GFX12-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, ptr addrspace(1) undef ret void @@ -1151,6 +1495,47 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp ; GFX11-NEXT: v_mad_f32 v0, v0, 4.0, s0 ; GFX11-NEXT: global_store_dword v[0:1], v0, off ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b64 s[4:5], exec +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX12-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX12-NEXT: s_cbranch_execz .LBB6_4 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b64 s7, s[4:5] +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s7 +; GFX12-NEXT: s_mov_b64 s[4:5], 0 +; GFX12-NEXT: v_mul_f32_e32 v2, 4.0, v1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_mov_b32_e32 v5, v1 +; GFX12-NEXT: v_add_f32_e32 v4, v5, v2 +; GFX12-NEXT: 
global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_wbinvl1_vol +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX12-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX12-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX12-NEXT: s_cbranch_execnz .LBB6_2 +; GFX12-NEXT: ; %bb.3: ; %Flow +; GFX12-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX12-NEXT: .LBB6_4: ; %Flow1 +; GFX12-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX12-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX12-NEXT: v_mad_f32 v0, v0, 4.0, s0 +; GFX12-NEXT: global_store_dword v[0:1], v0, off +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst store float %result, ptr addrspace(1) undef ret void @@ -1198,6 +1583,27 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr ; GFX11-NEXT: buffer_wbinvl1_vol ; GFX11-NEXT: .LBB7_2: ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b64 s[2:3], exec +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX12-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX12-NEXT: s_cbranch_execz .LBB7_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: buffer_wbinvl1_vol +; GFX12-NEXT: .LBB7_2: +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -1302,6 +1708,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX90A-NEXT: .LBB8_3: ; GFX90A-NEXT: s_endpgm ; +; 
GFX940-LABEL: global_atomic_fadd_noret_f32_safe: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB8_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB8_2: +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s3, exec_lo @@ -1366,6 +1794,37 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p ; GFX11-NEXT: s_cbranch_execnz .LBB8_2 ; GFX11-NEXT: .LBB8_3: ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_noret_f32_safe: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB8_3 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: 
s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_execnz .LBB8_2 +; GFX12-NEXT: .LBB8_3: +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -1444,6 +1903,26 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX90A-NEXT: .LBB9_2: ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: infer_as_before_atomic: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB9_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX940-NEXT: .LBB9_2: +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: infer_as_before_atomic: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_mov_b32 s3, exec_lo @@ -1496,6 +1975,27 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: infer_as_before_atomic: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mov_b32 s2, exec_lo +; GFX12-NEXT: s_mov_b32 s3, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB9_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX12-NEXT: .LBB9_2: +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %load = load ptr, ptr addrspace(4) %arg %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 ret void @@ -1625,6 +2125,50 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX90A-NEXT: global_store_short v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_ret_bf16_agent: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_and_b32 s2, s6, -4 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX940-NEXT: s_and_b32 s5, s6, 3 +; GFX940-NEXT: s_lshl_b32 s5, s5, 3 +; GFX940-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX940-NEXT: s_not_b32 s6, s6 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX940-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX940-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s6, v1 +; 
GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX940-NEXT: global_store_short v[0:1], v0, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_ret_bf16_agent: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -1711,6 +2255,50 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_ret_bf16_agent: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s0, s2, -4 +; GFX12-NEXT: s_mov_b32 s1, s3 +; GFX12-NEXT: s_and_b32 s2, s2, 3 +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX12-NEXT: s_lshl_b32 s2, s2, 3 +; GFX12-NEXT: s_lshl_b32 s4, 0xffff, s2 +; GFX12-NEXT: s_not_b32 s4, s4 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s2, v1 +; 
GFX12-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX12-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX12-NEXT: global_store_b16 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat 4.0 syncscope("agent") seq_cst store bfloat %result, ptr addrspace(1) undef ret void @@ -1842,6 +2430,50 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX90A-NEXT: global_store_short v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; +; GFX940-LABEL: global_atomic_fadd_ret_bf16_system: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_and_b32 s2, s6, -4 +; GFX940-NEXT: s_mov_b32 s3, s7 +; GFX940-NEXT: s_load_dword s7, s[2:3], 0x0 +; GFX940-NEXT: s_and_b32 s5, s6, 3 +; GFX940-NEXT: s_lshl_b32 s5, s5, 3 +; GFX940-NEXT: s_lshl_b32 s6, 0xffff, s5 +; GFX940-NEXT: s_not_b32 s6, s6 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: v_mov_b32_e32 v3, v1 +; GFX940-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v1, 4.0, v1 +; GFX940-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX940-NEXT: 
v_add3_u32 v2, v2, v1, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v2, v3, s6, v1 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[2:3] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, s5, v1 +; GFX940-NEXT: global_store_short v[0:1], v0, off sc0 sc1 +; GFX940-NEXT: s_endpgm +; ; GFX10-LABEL: global_atomic_fadd_ret_bf16_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -1928,13 +2560,57 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) % ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: global_atomic_fadd_ret_bf16_system: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s0, s2, -4 +; GFX12-NEXT: s_mov_b32 s1, s3 +; GFX12-NEXT: s_and_b32 s2, s2, 3 +; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX12-NEXT: s_lshl_b32 s2, s2, 3 +; GFX12-NEXT: s_lshl_b32 s4, 0xffff, s2 +; GFX12-NEXT: s_not_b32 s4, s4 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: v_lshrrev_b32_e32 v1, s2, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-NEXT: v_add_f32_e32 v1, 4.0, v1 +; 
GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, s2, v1 +; GFX12-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX12-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_or_b32 s3, vcc_lo, s3 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX12-NEXT: global_store_b16 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat 4.0 syncscope("one-as") seq_cst store bfloat %result, ptr addrspace(1) undef ret void } -define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { -; GFX900-LABEL: global_atomic_fadd_ret_v2f16: +define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_dword v3, v[0:1], off @@ -1956,7 +2632,7 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_atomic_fadd_ret_v2f16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_load_dword v3, v[0:1], off @@ -1978,7 +2654,7 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr 
addrspace(1) %ptr, <2 x half ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_atomic_fadd_ret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_load_dword v3, v[0:1], off @@ -2000,7 +2676,30 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_atomic_fadd_ret_v2f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB12_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v3, v[0:1], off @@ -2024,7 +2723,7 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_atomic_fadd_ret_v2f16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-NEXT: global_load_b32 v3, v[0:1], off @@ -2047,380 +2746,4819 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB12_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } -define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { -; GFX900-LABEL: global_atomic_fadd_noret_v2f16: +define <2 x half> @global_agent_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_load_dword v4, v[0:1], off +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:1024 ; GFX900-NEXT: s_mov_b64 s[4:5], 0 ; GFX900-NEXT: .LBB13_1: ; 
%atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX900-NEXT: v_mov_b32_e32 v4, v3 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX900-NEXT: s_cbranch_execnz .LBB13_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_atomic_fadd_noret_v2f16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:1024 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX908-NEXT: v_mov_b32_e32 v4, v3 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB13_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_atomic_fadd_noret_v2f16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: ; GFX90A: ; 
%bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_atomic_fadd_noret_v2f16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB13_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 
s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:1024 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, v3 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_atomic_fadd_noret_v2f16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:1024 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_mov_b32_e32 v4, v3 ; 
GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst - ret void +; +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB13_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result } -define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { -; GFX900-LABEL: global_atomic_fadd_ret_v2bf16: +define <2 x half> @global_agent_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX900: ; %bb.0: ; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_load_dword v3, v[0:1], off -; GFX900-NEXT: s_mov_b64 s[6:7], 0 -; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX900-NEXT: s_movk_i32 s8, 0x7fff -; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 ; GFX900-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v6, v3 -; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX900-NEXT: s_cbranch_execnz .LBB14_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX900-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off -; GFX908-NEXT: s_mov_b64 s[6:7], 0 -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX908-NEXT: s_movk_i32 s8, 0x7fff -; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, v3 -; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 -; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] -; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 -; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX908-NEXT: v_mov_b32_e32 v0, v3 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX90A-NEXT: s_movk_i32 s8, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v7, v3 -; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 -; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 -; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX90A-NEXT: s_or_b64 s[6:7], 
vcc, s[6:7] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
v_mov_b32_e32 v6, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 -; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_atomic_fadd_ret_v2bf16: +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 -; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: 
global_load_b32 v3, v[0:1], off offset:-1024 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst - ret <2 x bfloat> %result +; +; GFX12-LABEL: 
global_agent_atomic_fadd_ret_v2f16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret <2 x half> %result } -define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { -; GFX900-LABEL: global_atomic_fadd_noret_v2bf16: +define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_load_dword v3, v[0:1], off -; GFX900-NEXT: s_mov_b64 s[6:7], 0 -; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX900-NEXT: s_movk_i32 s8, 0x7fff -; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: global_load_dword v4, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[4:5], 0 ; GFX900-NEXT: .LBB15_1: ; 
%atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 -; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 -; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] -; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB15_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16: +; GFX12: ; 
%bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define void @global_agent_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v4, v[0:1], off offset:1024 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB16_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:1024 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:1024 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB16_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:1024 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB16_1: ; 
%atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB16_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define void @global_agent_atomic_fadd_noret_v2f16_negoffset(ptr 
addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v4, v[0:1], off offset:-1024 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB17_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-1024 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-1024 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst + ret void +} + +define <2 x half> @global_system_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_system_atomic_fadd_ret_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB18_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, 
v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB18_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB18_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB18_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB18_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define <2 x half> @global_system_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_system_atomic_fadd_ret_v2f16_offset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off 
offset:1024 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB19_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB19_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB19_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB19_1: ; 
%atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define <2 x half> @global_system_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB20_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: 
global_system_atomic_fadd_ret_v2f16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB20_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB20_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB20_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB20_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr 
<2 x half>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst + ret <2 x half> %result +} + +define void @global_system_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_system_atomic_fadd_noret_v2f16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v4, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB21_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB21_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: 
global_system_atomic_fadd_noret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB21_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; 
GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB21_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: 
.LBB21_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB21_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val seq_cst + ret void +} + +define void @global_system_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_system_atomic_fadd_noret_v2f16_offset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v4, v[0:1], off offset:1024 +; GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB22_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:1024 +; GFX908-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB22_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:1024 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB22_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:1024 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:1024 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 
v3, v[0:1], v[3:4], off offset:1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst + ret void +} + +define void @global_system_atomic_fadd_noret_v2f16_negoffset(ptr addrspace(1) %ptr, <2 x half> %val) { +; GFX900-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v4, v[0:1], off offset:-1024 +; 
GFX900-NEXT: s_mov_b64 s[4:5], 0 +; GFX900-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v4, v3 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_cbranch_execnz .LBB23_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB23_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
v_pk_add_f16 v4, v5, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v5, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB23_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-1024 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, 
v[0:1], v[3:4], off offset:-1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB23_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-1024 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB23_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-1024 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB23_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst + ret void +} + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB24_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz 
.LBB24_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB24_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: 
v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; 
GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB24_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 
0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB25_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: 
v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB25_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB25_1 +; 
GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB25_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB25_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off 
offset:1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB25_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, 
v[0:1], off offset:-1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB26_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB26_1: 
; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB26_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: 
v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: 
v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB26_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 
0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB26_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: 
buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB26_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 
exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB26_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret <2 x bfloat> %result +} + +define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: s_andn2_b64 
exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB27_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB27_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB27_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; 
GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: 
v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off 
th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + +define void @global_agent_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB28_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB28_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: 
s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 
s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB28_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB28_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 
v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB28_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; 
GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB28_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + +define void @global_agent_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 
+; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB29_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; 
GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB29_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB29_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: 
global_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB29_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB29_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB29_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_system_atomic_fadd_ret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB30_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, 
v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB30_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: 
v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB30_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; 
GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: 
global_system_atomic_fadd_ret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val seq_cst + ret <2 x bfloat> %result +} + +define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; 
GFX900-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB31_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB31_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB31_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 
0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB31_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, 
v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB31_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off 
offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB31_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + ret <2 x bfloat> %result +} + +define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX900-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX900-NEXT: 
global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB32_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v3 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX908-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 +; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB32_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v7, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB32_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: 
global_load_dword v3, v[0:1], off offset:-1024 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB32_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start +; 
GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v6, v3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB32_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v6, v3 +; 
GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo +; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB32_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + ret <2 x bfloat> %result +} + +define void @global_system_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_system_atomic_fadd_noret_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt 
vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB33_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: 
v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB33_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: 
v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB33_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB33_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB33_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off +; 
GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB33_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val seq_cst + ret void +} + +define void @global_system_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX900-NEXT: v_mov_b32_e32 v3, v2 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX900-NEXT: s_cbranch_execnz .LBB15_1 +; GFX900-NEXT: s_cbranch_execnz .LBB34_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX908-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_load_dword v3, v[0:1], off +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:1024 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -2438,28 +7576,28 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX908-NEXT: v_mov_b32_e32 v3, v2 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: s_cbranch_execnz .LBB34_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; -; GFX90A-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -2477,26 +7615,69 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] ; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc ; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 -; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: 
buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB34_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:1024 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -2515,7 +7696,7 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 ; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2523,21 +7704,21 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_cbranch_execnz .LBB15_1 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_atomic_fadd_noret_v2bf16: +; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: ; GFX11: ; %bb.0: ; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:1024 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 -; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 @@ -2556,7 +7737,7 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2564,12 +7745,343 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> ; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX11-NEXT: s_cbranch_execnz .LBB15_1 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst +; +; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: 
v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB34_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 + %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst + ret void +} + +define void @global_system_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) { +; GFX900-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX900-NEXT: s_mov_b64 s[6:7], 0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; 
GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX900-NEXT: s_mov_b32 s9, 0x7060302 +; GFX900-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX900-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX900-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX900-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX900-NEXT: v_mov_b32_e32 v3, v2 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_cbranch_execnz .LBB35_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX908-NEXT: v_and_b32_e32 
v6, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB35_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: 
v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 +; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 
+; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB35_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-1024 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 +; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_cbranch_execnz .LBB35_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX11-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_cbranch_execnz .LBB35_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 +; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB35_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 + %result = atomicrmw fadd 
ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst ret void } diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index 89abdb2b754a44..4373b76070e32a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s @@ -22,6 +25,20 @@ define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: lds_atomic_fadd_ret_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SE +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX7-LABEL: lds_atomic_fadd_ret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -87,6 +104,20 @@ define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: lds_atomic_fadd_noret_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_add_f32 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX7-LABEL: lds_atomic_fadd_noret_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -215,84 +246,329 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: lds_ds_fadd: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s3, s3, 4 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_lshl_b32 s8, s3, 3 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 -; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: ds_add_f32 v2, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_add_f32_e32 v2, s8, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: .LBB2_5: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s3, s[4:5] -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_readlane_b32 s9, v2, s3 -; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: v_writelane_b32 v0, s8, m0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: v_add_f32_e32 v1, s9, v1 -; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 -; GFX9-NEXT: ; %bb.6: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB2_8 -; GFX9-NEXT: ; %bb.7: -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_8: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm +; GFX908-LABEL: lds_ds_fadd: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_add_i32 s3, s3, 4 +; GFX908-NEXT: ; implicit-def: $vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX908-NEXT: s_cbranch_execz .LBB2_2 +; GFX908-NEXT: ; %bb.1: +; GFX908-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX908-NEXT: s_lshl_b32 s8, s3, 3 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: .LBB2_2: +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_cbranch_execz .LBB2_4 +; GFX908-NEXT: ; %bb.3: +; GFX908-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX908-NEXT: s_lshl_b32 s3, s3, 4 +; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: ds_add_f32 v2, v1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: .LBB2_4: +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_add_f32_e32 v2, s8, v0 +; GFX908-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX908-NEXT: ; implicit-def: $vgpr0 +; GFX908-NEXT: .LBB2_5: ; %ComputeLoop +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_ff1_i32_b64 s3, s[4:5] +; GFX908-NEXT: 
s_lshl_b64 s[6:7], 1, s3 +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX908-NEXT: v_writelane_b32 v0, s8, m0 +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX908-NEXT: ; %bb.6: ; %ComputeEnd +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX908-NEXT: ; implicit-def: $vgpr2 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB2_8 +; GFX908-NEXT: ; %bb.7: +; GFX908-NEXT: v_mov_b32_e32 v2, s2 +; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: .LBB2_8: +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: v_readfirstlane_b32 s2, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_store_dword v1, v0, s[0:1] +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: lds_ds_fadd: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_add_i32 s3, s3, 4 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB2_2 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: s_lshl_b32 s8, s3, 3 +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-NEXT: .LBB2_2: +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB2_4 +; GFX90A-NEXT: ; %bb.3: +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX90A-NEXT: s_lshl_b32 s3, s3, 4 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s3 +; GFX90A-NEXT: ds_add_f32 v2, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB2_4: +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_add_f32_e32 v2, s8, v0 +; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX90A-NEXT: ; implicit-def: $vgpr0 +; GFX90A-NEXT: .LBB2_5: ; %ComputeLoop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_ff1_i32_b64 s3, s[4:5] +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB2_8 +; GFX90A-NEXT: ; %bb.7: +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 +; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NEXT: .LBB2_8: +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: lds_ds_fadd: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_add_i32 s3, s3, 4 +; GFX940-NEXT: ; implicit-def: $vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX940-NEXT: s_cbranch_execz .LBB2_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX940-NEXT: s_lshl_b32 s8, s3, 3 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB2_2: +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_mov_b64 s[6:7], exec +; GFX940-NEXT: v_readfirstlane_b32 s8, v1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB2_4 +; GFX940-NEXT: ; %bb.3: +; GFX940-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX940-NEXT: s_lshl_b32 s3, s3, 4 +; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-NEXT: ds_add_f32 v2, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB2_4: +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; 
GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_add_f32_e32 v2, s8, v0 +; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX940-NEXT: ; implicit-def: $vgpr0 +; GFX940-NEXT: .LBB2_5: ; %ComputeLoop +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_ff1_i32_b64 s3, s[4:5] +; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX940-NEXT: v_readfirstlane_b32 s8, v1 +; GFX940-NEXT: v_readlane_b32 s9, v2, s3 +; GFX940-NEXT: s_mov_b32 m0, s3 +; GFX940-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX940-NEXT: v_writelane_b32 v0, s8, m0 +; GFX940-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX940-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX940-NEXT: ; %bb.6: ; %ComputeEnd +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX940-NEXT: ; implicit-def: $vgpr2 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX940-NEXT: s_cbranch_execz .LBB2_8 +; GFX940-NEXT: ; %bb.7: +; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB2_8: +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_readfirstlane_b32 s2, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_store_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: lds_ds_fadd: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX12-NEXT: s_mov_b32 s5, exec_lo +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s3, s3, 4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB2_2 +; 
GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 +; GFX12-NEXT: s_lshl_b32 s5, s3, 3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: .LBB2_2: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s6, exec_lo +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX12-NEXT: s_cbranch_execz .LBB2_4 +; GFX12-NEXT: ; %bb.3: +; GFX12-NEXT: s_bcnt1_i32_b32 s6, s6 +; GFX12-NEXT: s_lshl_b32 s3, s3, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: ds_add_f32 v2, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: .LBB2_4: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: s_brev_b32 s3, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX12-NEXT: v_add_f32_e32 v1, s5, v0 +; GFX12-NEXT: ; implicit-def: $vgpr0 +; GFX12-NEXT: .LBB2_5: ; %ComputeLoop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_ctz_i32_b32 s5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_readlane_b32 s6, v1, s5 +; GFX12-NEXT: s_lshl_b32 s7, 1, s5 +; GFX12-NEXT: v_writelane_b32 v0, s3, s5 +; GFX12-NEXT: s_and_not1_b32 s4, s4, s7 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s4, 0 +; GFX12-NEXT: s_add_f32 s3, s3, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX12-NEXT: ; %bb.6: ; %ComputeEnd +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX12-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execz .LBB2_8 +; GFX12-NEXT: ; %bb.7: +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: .LBB2_8: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm ; ; GFX7-LABEL: lds_ds_fadd: ; GFX7: ; %bb.0: @@ -566,82 +842,319 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: lds_ds_fadd_one_as: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s3, s3, 4 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_lshl_b32 s8, s3, 3 -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 -; GFX9-NEXT: 
v_mul_f32_e32 v1, 0x42280000, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: ds_add_rtn_f32 v1, v2, v1 -; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 -; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: ds_add_f32 v2, v1 -; GFX9-NEXT: .LBB3_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_add_f32_e32 v2, s8, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: .LBB3_5: ; %ComputeLoop -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b64 s3, s[4:5] -; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 -; GFX9-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-NEXT: v_readlane_b32 s9, v2, s3 -; GFX9-NEXT: s_mov_b32 m0, s3 -; GFX9-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: v_writelane_b32 v0, s8, m0 -; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: v_add_f32_e32 v1, s9, v1 -; GFX9-NEXT: s_cbranch_scc1 .LBB3_5 -; GFX9-NEXT: ; %bb.6: ; %ComputeEnd -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB3_8 -; GFX9-NEXT: ; %bb.7: -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX9-NEXT: .LBB3_8: 
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_f32_e32 v0, s2, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-NEXT: s_endpgm +; GFX908-LABEL: lds_ds_fadd_one_as: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_add_i32 s3, s3, 4 +; GFX908-NEXT: ; implicit-def: $vgpr1 +; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX908-NEXT: s_cbranch_execz .LBB3_2 +; GFX908-NEXT: ; %bb.1: +; GFX908-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX908-NEXT: s_lshl_b32 s8, s3, 3 +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX908-NEXT: .LBB3_2: +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_mov_b64 s[6:7], exec +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_cbranch_execz .LBB3_4 +; GFX908-NEXT: ; %bb.3: +; GFX908-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX908-NEXT: s_lshl_b32 s3, s3, 4 +; GFX908-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: ds_add_f32 v2, v1 +; GFX908-NEXT: .LBB3_4: +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX908-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: v_add_f32_e32 v2, s8, v0 +; 
GFX908-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX908-NEXT: ; implicit-def: $vgpr0 +; GFX908-NEXT: .LBB3_5: ; %ComputeLoop +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_ff1_i32_b64 s3, s[4:5] +; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX908-NEXT: v_readfirstlane_b32 s8, v1 +; GFX908-NEXT: v_readlane_b32 s9, v2, s3 +; GFX908-NEXT: s_mov_b32 m0, s3 +; GFX908-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX908-NEXT: v_writelane_b32 v0, s8, m0 +; GFX908-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX908-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX908-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX908-NEXT: ; %bb.6: ; %ComputeEnd +; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX908-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX908-NEXT: ; implicit-def: $vgpr2 +; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX908-NEXT: s_cbranch_execz .LBB3_8 +; GFX908-NEXT: ; %bb.7: +; GFX908-NEXT: v_mov_b32_e32 v2, s2 +; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 +; GFX908-NEXT: .LBB3_8: +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_readfirstlane_b32 s2, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX908-NEXT: global_store_dword v1, v0, s[0:1] +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: lds_ds_fadd_one_as: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_add_i32 s3, s3, 4 +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB3_2 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX90A-NEXT: s_lshl_b32 s8, s3, 3 +; GFX90A-NEXT: 
v_cvt_f32_ubyte0_e32 v1, s4 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX90A-NEXT: .LBB3_2: +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], exec +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB3_4 +; GFX90A-NEXT: ; %bb.3: +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX90A-NEXT: s_lshl_b32 s3, s3, 4 +; GFX90A-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s3 +; GFX90A-NEXT: ds_add_f32 v2, v1 +; GFX90A-NEXT: .LBB3_4: +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], exec +; GFX90A-NEXT: v_add_f32_e32 v2, s8, v0 +; GFX90A-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX90A-NEXT: ; implicit-def: $vgpr0 +; GFX90A-NEXT: .LBB3_5: ; %ComputeLoop +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_ff1_i32_b64 s3, s[4:5] +; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX90A-NEXT: v_readfirstlane_b32 s8, v1 +; GFX90A-NEXT: v_readlane_b32 s9, v2, s3 +; GFX90A-NEXT: s_mov_b32 m0, s3 +; GFX90A-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX90A-NEXT: v_writelane_b32 v0, s8, m0 +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX90A-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: 
s_cbranch_execz .LBB3_8 +; GFX90A-NEXT: ; %bb.7: +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 +; GFX90A-NEXT: .LBB3_8: +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_readfirstlane_b32 s2, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: lds_ds_fadd_one_as: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_add_i32 s3, s3, 4 +; GFX940-NEXT: ; implicit-def: $vgpr1 +; GFX940-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX940-NEXT: s_cbranch_execz .LBB3_2 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX940-NEXT: s_lshl_b32 s8, s3, 3 +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s4 +; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX940-NEXT: .LBB3_2: +; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX940-NEXT: s_mov_b64 s[6:7], exec +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_readfirstlane_b32 s8, v1 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v1, s6, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v1, s7, v1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB3_4 +; GFX940-NEXT: ; %bb.3: +; GFX940-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX940-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX940-NEXT: s_lshl_b32 s3, s3, 4 +; GFX940-NEXT: v_mul_f32_e32 v1, 0x42280000, v1 +; GFX940-NEXT: v_mov_b32_e32 v2, s3 +; GFX940-NEXT: ds_add_f32 v2, v1 +; GFX940-NEXT: .LBB3_4: +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX940-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX940-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX940-NEXT: s_mov_b64 s[4:5], exec +; GFX940-NEXT: v_add_f32_e32 v2, s8, v0 +; GFX940-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX940-NEXT: ; implicit-def: $vgpr0 +; GFX940-NEXT: .LBB3_5: ; %ComputeLoop +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_ff1_i32_b64 s3, s[4:5] +; GFX940-NEXT: s_lshl_b64 s[6:7], 1, s3 +; GFX940-NEXT: v_readfirstlane_b32 s8, v1 +; GFX940-NEXT: v_readlane_b32 s9, v2, s3 +; GFX940-NEXT: s_mov_b32 m0, s3 +; GFX940-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] +; GFX940-NEXT: v_writelane_b32 v0, s8, m0 +; GFX940-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX940-NEXT: v_add_f32_e32 v1, s9, v1 +; GFX940-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX940-NEXT: ; %bb.6: ; %ComputeEnd +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX940-NEXT: ; implicit-def: $vgpr2 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX940-NEXT: s_cbranch_execz .LBB3_8 +; GFX940-NEXT: ; %bb.7: +; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 +; GFX940-NEXT: .LBB3_8: +; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_readfirstlane_b32 s2, v2 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v0, s2, v0 +; GFX940-NEXT: global_store_dword v1, v0, s[0:1] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX12-LABEL: lds_ds_fadd_one_as: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX12-NEXT: s_mov_b32 s5, exec_lo +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_co_i32 s3, s3, 4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_cmpx_eq_u32_e32 0, v0 +; GFX12-NEXT: s_cbranch_execz .LBB3_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s5 +; GFX12-NEXT: s_lshl_b32 s5, s3, 3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1 +; GFX12-NEXT: .LBB3_2: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s6, exec_lo +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s5, v1 +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX12-NEXT: s_cbranch_execz .LBB3_4 +; GFX12-NEXT: ; %bb.3: +; GFX12-NEXT: s_bcnt1_i32_b32 s6, s6 +; GFX12-NEXT: s_lshl_b32 s3, s3, 4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v1, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1 +; GFX12-NEXT: ds_add_f32 v2, v1 +; GFX12-NEXT: .LBB3_4: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX12-NEXT: s_mov_b32 s4, exec_lo +; GFX12-NEXT: s_brev_b32 s3, 1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mul_f32_e32 v0, 0x42280000, v0 +; GFX12-NEXT: v_add_f32_e32 v1, s5, v0 +; GFX12-NEXT: ; implicit-def: $vgpr0 +; GFX12-NEXT: .LBB3_5: ; %ComputeLoop +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_ctz_i32_b32 s5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_readlane_b32 s6, v1, s5 +; GFX12-NEXT: s_lshl_b32 s7, 1, s5 +; GFX12-NEXT: v_writelane_b32 v0, s3, s5 +; GFX12-NEXT: s_and_not1_b32 s4, s4, s7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u32 s4, 0 +; 
GFX12-NEXT: s_add_f32 s3, s3, s6 +; GFX12-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX12-NEXT: ; %bb.6: ; %ComputeEnd +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX12-NEXT: ; implicit-def: $vgpr1 +; GFX12-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX12-NEXT: s_xor_b32 s4, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execz .LBB3_8 +; GFX12-NEXT: ; %bb.7: +; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 +; GFX12-NEXT: .LBB3_8: +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm ; ; GFX7-LABEL: lds_ds_fadd_one_as: ; GFX7: ; %bb.0: @@ -858,27 +1371,73 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_ret_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: ds_read_b64 v[0:1], v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 -; GFX9-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: 
s_cbranch_execnz .LBB4_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_ret_f64: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: ds_read_b64 v[0:1], v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v1 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB4_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_ret_f64: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_ret_f64: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fadd_ret_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, v0 +; GFX12-NEXT: ds_load_b64 v[0:1], v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; 
GFX12-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB4_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_ret_f64: ; GFX7: ; %bb.0: @@ -952,26 +1511,70 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_noret_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b64 v[1:2], v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 -; GFX9-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB5_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_noret_f64: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b64 
v[1:2], v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_noret_f64: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: ds_add_f64 v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_noret_f64: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: ds_add_f64 v0, v[2:3] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fadd_noret_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b64 v[1:2], v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], 
v[1:2] +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB5_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_f64: ; GFX7: ; %bb.0: @@ -1043,26 +1646,97 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin ; VI-NEXT: v_mov_b32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fsub_ret_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b32 v2, v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: v_sub_f32_e32 v2, v3, v1 -; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB6_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fsub_ret_f32: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, 
s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB6_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fsub_ret_f32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fsub_ret_f32: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB6_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fsub_ret_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; 
GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fsub_ret_f32: ; GFX7: ; %bb.0: @@ -1133,25 +1807,92 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fsub_noret_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b32 v2, v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_sub_f32_e32 v3, v2, v1 -; GFX9-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB7_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fsub_noret_f32: +; GFX908: ; 
%bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB7_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fsub_noret_f32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fsub_noret_f32: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB7_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fsub_noret_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB7_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fsub_noret_f32: ; GFX7: ; %bb.0: @@ -1223,28 +1964,103 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw ; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fsub_ret_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b64 v[3:4], v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] -; 
GFX9-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB8_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fsub_ret_f64: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b64 v[3:4], v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX908-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB8_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: v_mov_b32_e32 v1, v4 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fsub_ret_f64: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: ds_read_b64 v[0:1], v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[0:1], v[6:7], -v[4:5] +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[6:7], v[0:1] 
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fsub_ret_f64: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v5, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v0 +; GFX940-NEXT: v_mov_b32_e32 v4, v1 +; GFX940-NEXT: ds_read_b64 v[0:1], v0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] +; GFX940-NEXT: v_add_f64 v[0:1], v[6:7], -v[4:5] +; GFX940-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[6:7], v[0:1] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB8_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fsub_ret_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b64 v[3:4], v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_f64_e64 v[3:4], v[5:6], -v[1:2] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[5:6] +; GFX12-NEXT: 
s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[5:6] +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB8_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fsub_ret_f64: ; GFX7: ; %bb.0: @@ -1320,26 +2136,97 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fsub_noret_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b64 v[3:4], v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] -; GFX9-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v4, v6 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB9_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fsub_noret_f64: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b64 v[3:4], v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX908-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX908-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v6 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB9_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fsub_noret_f64: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b64 v[4:5], v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_add_f64 v[6:7], v[4:5], -v[2:3] +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v0, v[4:5], v[6:7] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fsub_noret_f64: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b64 v[4:5], v0 +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_add_f64 v[6:7], v[4:5], -v[2:3] +; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v0, v[4:5], v[6:7] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b64_e32 
v[4:5], v[6:7] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB9_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fsub_noret_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b64 v[3:4], v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_add_f64_e64 v[5:6], v[3:4], -v[1:2] +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b64 v[5:6], v0, v[5:6], v[3:4] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[5:6], v[3:4] +; GFX12-NEXT: v_dual_mov_b32 v3, v5 :: v_dual_mov_b32 v4, v6 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fsub_noret_f64: ; GFX7: ; %bb.0: @@ -1427,41 +2314,164 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_ret_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX9-NEXT: ds_read_b32 v3, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: 
.LBB10_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_or_b32 v3, v4, v2, v3 -; GFX9-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB10_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_ret_bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX908-NEXT: ds_read_b32 v3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX908-NEXT: v_not_b32_e32 v2, v2 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX908-NEXT: v_add3_u32 v5, v5, 
v3, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB10_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_ret_bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, v3 +; GFX90A-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX90A-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX90A-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, 
v4 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_ret_bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX940-NEXT: ds_read_b32 v3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX940-NEXT: v_not_b32_e32 v2, v2 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v3 +; GFX940-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX940-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v5, v5, v3, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v1, v4, v3 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB10_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: 
lds_atomic_fadd_ret_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: ds_load_b32 v3, v1 +; GFX12-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_not_b32_e32 v2, v2 +; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB10_1 +; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_lshrrev_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_ret_bf16: ; GFX7: ; %bb.0: @@ -1572,40 +2582,160 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind { ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_noret_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, -4, v0 -; GFX9-NEXT: ds_read_b32 v3, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_or_b32 v4, v3, v2, v4 -; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB11_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_noret_bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX908-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX908-NEXT: ds_read_b32 v3, v1 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX908-NEXT: s_mov_b32 s4, 0xffff +; GFX908-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX908-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX908-NEXT: v_not_b32_e32 v2, v2 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_movk_i32 s6, 0x7fff +; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX908-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX908-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX908-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB11_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_noret_bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX90A-NEXT: ds_read_b32 v3, v1 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX90A-NEXT: s_mov_b32 s4, 0xffff +; GFX90A-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX90A-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_movk_i32 s6, 0x7fff +; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX90A-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_noret_bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX940-NEXT: ds_read_b32 v3, v1 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX940-NEXT: s_mov_b32 s0, 0xffff +; GFX940-NEXT: v_and_b32_e32 v0, 24, v2 +; GFX940-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX940-NEXT: v_not_b32_e32 v2, v2 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_movk_i32 s2, 0x7fff +; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX940-NEXT: v_add3_u32 v5, v5, v4, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX940-NEXT: 
s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_and_or_b32 v4, v3, v2, v4 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v1, v3, v4 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB11_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fadd_noret_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v1, -4, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: ds_load_b32 v2, v1 +; GFX12-NEXT: v_lshlrev_b32_e64 v3, v0, 0xffff +; GFX12-NEXT: v_and_b32_e32 v0, 24, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_not_b32_e32 v3, v3 +; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_f32_e32 v4, 4.0, v4 +; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v4 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB11_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_bf16: ; GFX7: ; %bb.0: @@ -1692,6 +2822,20 @@ define float @lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX7-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1757,6 +2901,20 @@ define void @lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrspac ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_add_f32 v0, v1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_setpc_b64 s[30:31] +; ; GFX7-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1827,26 +2985,97 @@ define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> % ; VI-NEXT: v_mov_b32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_ret_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b32 v2, v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, v2 -; GFX9-NEXT: v_pk_add_f16 v2, v3, v1 -; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB14_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v3, v2 +; GFX908-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 
+; GFX908-NEXT: s_cbranch_execnz .LBB14_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB14_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fadd_ret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 
+; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB14_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_ret_v2f16: ; GFX7: ; %bb.0: @@ -1959,25 +3188,92 @@ define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val) ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_noret_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b32 v2, v0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 -; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v3, v2, v1 -; GFX9-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_cbranch_execnz .LBB15_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_noret_v2f16: +; 
GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX908-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v2, v3 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB15_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v2, v3 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX940-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v2, v3 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB15_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fadd_noret_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB15_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_v2f16: ; GFX7: ; %bb.0: @@ -2106,44 +3402,171 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo ; VI-NEXT: v_mov_b32_e32 v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_ret_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b32 v2, v0 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: s_mov_b32 s9, 0x7060302 -; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This 
Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s8 -; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX9-NEXT: v_perm_b32 v2, v5, v2, s9 -; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB16_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b32 v2, v0 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX908-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX908-NEXT: 
v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX908-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB16_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b32 v2, v0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, v2 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v2, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v2, v5, v2, s9 +; GFX90A-NEXT: 
ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v2, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v4, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX940-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v2, v5, v2, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v2, v0, v4, v2 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB16_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; 
GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fadd_ret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b32 v2, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX12-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX12-NEXT: v_add_f32_e32 v2, v2, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 +; GFX12-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 +; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB16_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_ret_v2bf16: ; GFX7: ; %bb.0: @@ -2267,43 +3690,166 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v ; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: lds_atomic_fadd_noret_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b32 v3, v0 -; GFX9-NEXT: s_mov_b64 s[6:7], 0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: s_mov_b32 s9, 0x7060302 -; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v1 -; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX9-NEXT: v_add3_u32 v6, v6, v4, s8 -; GFX9-NEXT: v_add3_u32 v8, v8, v5, s8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s9 -; GFX9-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX9-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_cbranch_execnz .LBB17_1 -; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX908-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: ds_read_b32 v3, v0 +; GFX908-NEXT: s_mov_b64 s[6:7], 0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX908-NEXT: s_movk_i32 s8, 0x7fff +; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX908-NEXT: s_mov_b32 s9, 0x7060302 +; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX908-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX908-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX908-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX908-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX908-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX908-NEXT: v_mov_b32_e32 v3, v4 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_cbranch_execnz .LBB17_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ds_read_b32 v3, v0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; 
GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX90A-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v4, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v5, v4, s9 +; GFX90A-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_read_b32 v3, v0 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX940-NEXT: s_movk_i32 s4, 0x7fff +; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX940-NEXT: s_mov_b32 s5, 0x7060302 +; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX940-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX940-NEXT: v_add_f32_e32 v5, v5, v1 +; GFX940-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX940-NEXT: 
v_or_b32_e32 v7, 0x400000, v4 +; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX940-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX940-NEXT: v_add3_u32 v8, v8, v5, s4 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v4, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[0:1] +; GFX940-NEXT: v_perm_b32 v4, v5, v4, s5 +; GFX940-NEXT: ds_cmpst_rtn_b32 v4, v0, v3, v4 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB17_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: lds_atomic_fadd_noret_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: ds_load_b32 v3, v0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX12-NEXT: s_mov_b32 s1, 0 +; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX12-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX12-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-NEXT: v_add3_u32 v6, v6, 
v4, 0x7fff +; GFX12-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302 +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v4 +; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_cbranch_execnz .LBB17_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: lds_atomic_fadd_noret_v2bf16: ; GFX7: ; %bb.0: diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 17318b2c62ca8c..c9366f4434b1c1 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -1750,7 +1750,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] +; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: 
[[TMP3:%.*]] = bitcast float [[LOADED]] to i32 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 @@ -1766,7 +1766,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] +; GFX9-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 ; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4 @@ -1803,7 +1803,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] +; CI-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]] ; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic 
monotonic, align 8 @@ -1819,7 +1819,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] +; GFX9-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 @@ -1835,7 +1835,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 @@ -1859,7 +1859,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) % ; 
GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]] +; GFX11-NEXT: [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 ; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8 @@ -1880,7 +1880,7 @@ define float @test_atomicrmw_fadd_f32_local_strictfp(ptr addrspace(3) %ptr, floa ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] +; CI-NEXT: [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]] ; CI-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 ; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 @@ -2102,7 +2102,7 @@ define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %p define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 { ; ALL-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call 
ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR9:[0-9]+]] +; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR10:[0-9]+]] ; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -2115,7 +2115,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; ALL-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]] +; ALL-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]] ; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] @@ -4669,11 +4669,180 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ret void } +define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr 
[[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label 
[[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { +; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], 
label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x 
bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { +; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { +; ALL-LABEL: 
@test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] +; ALL: atomicrmw.start: +; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; ALL: atomicrmw.end: +; ALL-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } attributes #3 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #4 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #5 = { "denormal-fp-math"="dynamic,dynamic" } +attributes #6 = { "amdgpu-unsafe-fp-atomics"="true" } !0 = !{} From 460408f78b30720950040e336f7b566aa7203269 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 11 Jun 2024 13:46:09 +0100 Subject: [PATCH 62/82] Reapply "[MLIR][Flang][DebugInfo] Set debug info format in MLIR->IR translation (#95098)" Reapplies the original patch with some additional conversion layers added to the MLIR translator, to ensure that we don't write the new debug 
info format unless WriteNewDbgInfoFormat is set. This reverts commit 8c5d9c79b96ed8297b381e00d3a706a432cd6c9d. --- flang/lib/Frontend/FrontendActions.cpp | 9 +++++++++ mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 10 ++++++++++ 2 files changed, 19 insertions(+) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index b1b6391f1439c6..a4db944e8c0abd 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -50,6 +50,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/IR/DebugProgramInstruction.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" @@ -81,6 +82,8 @@ using namespace Fortran::frontend; llvm::PassPluginLibraryInfo get##Ext##PluginInfo(); #include "llvm/Support/Extension.def" +extern llvm::cl::opt WriteNewDbgInfoFormat; + /// Save the given \c mlirModule to a temporary .mlir file, in a location /// decided by the -save-temps flag. No files are produced if the flag is not /// specified. @@ -1271,6 +1274,12 @@ void CodeGenAction::executeAction() { runOptimizationPipeline(ci.isOutputStreamNull() ? *os : ci.getOutputStream()); if (action == BackendActionTy::Backend_EmitLL) { + // When printing LLVM IR, we should convert the module to the debug info + // format that LLVM expects us to print. + llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, + WriteNewDbgInfoFormat); + if (WriteNewDbgInfoFormat) + llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(ci.isOutputStreamNull() ? 
*os : ci.getOutputStream(), /*AssemblyAnnotationWriter=*/nullptr); return; diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 7b86b250c294b4..e1a60f195fe89c 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -64,6 +64,8 @@ using namespace mlir; using namespace mlir::LLVM; using namespace mlir::LLVM::detail; +extern llvm::cl::opt UseNewDbgInfoFormat; + #include "mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc" namespace { @@ -1789,6 +1791,9 @@ prepareLLVMModule(Operation *m, llvm::LLVMContext &llvmContext, StringRef name) { m->getContext()->getOrLoadDialect(); auto llvmModule = std::make_unique(name, llvmContext); + // ModuleTranslation can currently only construct modules in the old debug + // info format, so set the flag accordingly. + llvmModule->setNewDbgInfoFormatFlag(false); if (auto dataLayoutAttr = m->getDiscardableAttr(LLVM::LLVMDialect::getDataLayoutAttrName())) { llvmModule->setDataLayout(cast(dataLayoutAttr).getValue()); @@ -1867,6 +1872,11 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, if (failed(translator.convertFunctions())) return nullptr; + // Once we've finished constructing elements in the module, we should convert + // it to use the debug info format desired by LLVM. + // See https://llvm.org/docs/RemoveDIsDebugInfo.html + translator.llvmModule->setIsNewDbgInfoFormat(UseNewDbgInfoFormat); + if (!disableVerification && llvm::verifyModule(*translator.llvmModule, &llvm::errs())) return nullptr; From 3cc2710e0dd53bb82742904fa13014018a1137ed Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 11 Jun 2024 13:41:23 +0100 Subject: [PATCH 63/82] [MLIR][Flang][DebugInfo] Convert debug format in MLIR translators Following from the previous commit, this patch converts to the appropriate debug info format before printing LLVM IR. 
See: https://github.com/llvm/llvm-project/pull/95098 --- flang/lib/Frontend/FrontendActions.cpp | 1 + mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp | 10 ++++++++++ mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp | 10 ++++++++++ 3 files changed, 21 insertions(+) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index a4db944e8c0abd..a74d2be9be3b1c 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -1276,6 +1276,7 @@ void CodeGenAction::executeAction() { if (action == BackendActionTy::Backend_EmitLL) { // When printing LLVM IR, we should convert the module to the debug info // format that LLVM expects us to print. + // See https://llvm.org/docs/RemoveDIsDebugInfo.html llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, WriteNewDbgInfoFormat); if (WriteNewDbgInfoFormat) diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp index 45588937795348..be3b36c7620559 100644 --- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp @@ -16,9 +16,12 @@ #include "mlir/Target/LLVMIR/Dialect/All.h" #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Tools/mlir-translate/Translation.h" +#include "llvm/IR/DebugProgramInstruction.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +extern llvm::cl::opt WriteNewDbgInfoFormat; + using namespace mlir; namespace mlir { @@ -31,6 +34,13 @@ void registerToLLVMIRTranslation() { if (!llvmModule) return failure(); + // When printing LLVM IR, we should convert the module to the debug info + // format that LLVM expects us to print. 
+ // See https://llvm.org/docs/RemoveDIsDebugInfo.html + llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, + WriteNewDbgInfoFormat); + if (WriteNewDbgInfoFormat) + llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(output, nullptr); return success(); }, diff --git a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp index 57e7d658fb501f..813b4960faa94d 100644 --- a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp +++ b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp @@ -22,6 +22,9 @@ #include "mlir/Tools/mlir-translate/Translation.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/IR/DebugProgramInstruction.h" + +extern llvm::cl::opt WriteNewDbgInfoFormat; using namespace mlir; @@ -122,6 +125,13 @@ void registerTestToLLVMIR() { if (!llvmModule) return failure(); + // When printing LLVM IR, we should convert the module to the debug info + // format that LLVM expects us to print. + // See https://llvm.org/docs/RemoveDIsDebugInfo.html + llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, + WriteNewDbgInfoFormat); + if (WriteNewDbgInfoFormat) + llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(output, nullptr); return success(); }, From d4c6478cf2c0a8e2bebc66eb7eff4d0f11626d9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Tue, 11 Jun 2024 13:56:45 +0100 Subject: [PATCH 64/82] [mlir][vector] Update tests for collapse 1/n (nfc) (#94490) The main goal of this PR (and subsequent PRs), is to add more tests with scalable vectors to: * vector-transfer-collapse-inner-most-dims.mlir There's quite a few cases to consider, hence this is split into multiple PRs. In this PR, the very first test is complemented with all the possible combinations: * scalable (rather than fixed) unit trailing dim, * dynamic (rather than static) trailing dim in the source memref. 
Also, * `@leading_scalable_dimension_transfer_read` and `@trailing_scalable_one_dim_transfer_read`, are replaced with: * `@contiguous_inner_most_scalable_inner_dim` and `@negative_scalable_unit_dim`, respectively, and added to the list above (i.e. alongside other variations for the very first test). In addition: * "_view" is removed from function names (it's not clear to me what it was meant to signify) * extra comments are added to separate tests for vector.transfer_read and vector.transfer_write NOTE: This PR is limited to tests for `vector.transfer_read`. --- ...tor-transfer-collapse-inner-most-dims.mlir | 78 ++++++++++++++----- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir index b4cb640108baee..9b23681dba6a8a 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir @@ -1,12 +1,17 @@ // RUN: mlir-opt %s -test-vector-transfer-collapse-inner-most-dims -split-input-file | FileCheck %s -func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ +//----------------------------------------------------------------------------- +// 1. 
vector.transfer_read +//----------------------------------------------------------------------------- + +func.func @contiguous_inner_most(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> return %0 : vector<1x8x1xf32> } -// CHECK: func @contiguous_inner_most_view(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> + +// CHECK: func @contiguous_inner_most(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> // CHECK: %[[SRC_0:.+]] = memref.subview %[[SRC]] // CHECK-SAME: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> to memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>> // CHECK: %[[VEC:.+]] = vector.transfer_read %[[SRC_0]] @@ -14,15 +19,61 @@ func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8, // CHECK: %[[RESULT:.+]] = vector.shape_cast %[[VEC]] // CHECK: return %[[RESULT]] +// Same as the top example within this split, but with the inner vector +// dim scalable. Note that this example only makes sense when "8 = [8]" (i.e. +// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute. 
+ +func.func @contiguous_inner_most_scalable_inner_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x[8]x1xf32>{ + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x[8]x1xf32> + return %0 : vector<1x[8]x1xf32> +} + +// CHECK: func @contiguous_inner_most_scalable_inner_dim(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> +// CHECK: %[[SRC_0:.+]] = memref.subview %[[SRC]] +// CHECK-SAME: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> to memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>> +// CHECK: %[[VEC:.+]] = vector.transfer_read %[[SRC_0]] +// CHECK-SAME: memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>>, vector<1x[8]xf32> +// CHECK: %[[RESULT:.+]] = vector.shape_cast %[[VEC]] +// CHECK: return %[[RESULT]] + +// Same as the top example within this split, but the trailing unit dim was +// replaced with a dyn dim - not supported + +func.func @non_unit_trailing_dim(%in: memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{ + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32> + return %0 : vector<1x8x1xf32> +} + +// CHECK-LABEL: func @non_unit_trailing_dim +// CHECK-NOT: memref.subview +// CHECK-NOT: vector.shape_cast + +// Same as the top example within this split, but with a scalable unit dim in +// the output vector - not supported + +func.func @negative_scalable_unit_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x[1]xf32>{ + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : 
memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x[1]xf32> + return %0 : vector<1x8x[1]xf32> +} +// CHECK-LABEL: func @negative_scalable_unit_dim +// CHECK-NOT: memref.subview +// CHECK-NOT: vector.shape_cast + // ----- -func.func @contiguous_outer_dyn_inner_most_view(%a: index, %b: index, %memref: memref) -> vector<8x1xf32> { +func.func @contiguous_outer_dyn_inner_most(%a: index, %b: index, %memref: memref) -> vector<8x1xf32> { %c0 = arith.constant 0 : index %pad = arith.constant 0.0 : f32 %v = vector.transfer_read %memref[%a, %b, %c0, %c0], %pad {in_bounds = [true, true]} : memref, vector<8x1xf32> return %v : vector<8x1xf32> } -// CHECK: func.func @contiguous_outer_dyn_inner_most_view( +// CHECK: func.func @contiguous_outer_dyn_inner_most( // CHECK-SAME: %[[IDX0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[IDX1:[a-zA-Z0-9]+]] // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]] @@ -103,6 +154,10 @@ func.func @contiguous_inner_most_dim_out_of_bounds_2d(%arg0: memref<1x1xf32>) -> // ----- +//----------------------------------------------------------------------------- +// 2. 
vector.transfer_write +//----------------------------------------------------------------------------- + func.func @drop_two_inner_most_dim_for_transfer_write(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector<1x16x16x1x1xf32>, %arg2: index) { %c0 = arith.constant 0 : index vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0] @@ -177,21 +232,6 @@ func.func @non_unit_strides(%arg0: memref<512x16x1xf32, strided<[8192, 16, 4], o // ----- -func.func @leading_scalable_dimension_transfer_read(%dest : memref<24x1xf32>) -> vector<[4]x1xf32> { - %c0 = arith.constant 0 : index - %pad = arith.constant 0.0 : f32 - %0 = vector.transfer_read %dest[%c0, %c0], %pad {in_bounds = [true, true]} : memref<24x1xf32>, vector<[4]x1xf32> - return %0 : vector<[4]x1xf32> -} -// CHECK: func.func @leading_scalable_dimension_transfer_read -// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] -// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[DEST]][0, 0] [24, 1] [1, 1] : memref<24x1xf32> to memref<24xf32, strided<[1]>> -// CHECK: %[[READ:.+]] = vector.transfer_read %[[SUBVIEW]]{{.*}} {in_bounds = [true]} : memref<24xf32, strided<[1]>>, vector<[4]xf32> -// CHECK: %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<[4]xf32> to vector<[4]x1xf32> -// CHECK: return %[[CAST]] - -// ----- - // Negative test: [1] (scalable 1) is _not_ a unit dimension. func.func @trailing_scalable_one_dim_transfer_read(%dest : memref<24x1xf32>) -> vector<4x[1]xf32> { %c0 = arith.constant 0 : index From 1bae10879d9183c5edfb709c36b55086ebc772f0 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Tue, 11 Jun 2024 21:01:52 +0800 Subject: [PATCH 65/82] [clang-tidy] fix false positives for the functions with the same name as standard library functions in misc-include-cleaner (#94923) Fixes: #93335 For a decl with a body, we should also provide physical locations, because it may be a function that has the same name as a standard library function.
--- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../include-cleaner/lib/LocateSymbol.cpp | 8 ++++++-- .../include-cleaner/unittests/FindHeadersTest.cpp | 11 +++++++++++ .../test/clang-tidy/checkers/misc/include-cleaner.cpp | 8 ++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 2dc39d0ad74af8..6bf70c5cf4f8a5 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -337,6 +337,10 @@ Changes in existing checks ` check by avoiding crash for self include cycles. +- Improved :doc:`misc-include-cleaner + ` check by avoiding false positives for + the functions with the same name as standard library functions. + - Improved :doc:`misc-unused-using-decls ` check by replacing the local option `HeaderFileExtensions` by the global option of the same name. diff --git a/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp b/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp index 78e783a62eb27f..9148d36a5038f9 100644 --- a/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp +++ b/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp @@ -14,6 +14,7 @@ #include "clang/AST/DeclTemplate.h" #include "clang/Tooling/Inclusions/StandardLibrary.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" #include #include @@ -40,8 +41,11 @@ Hints declHints(const Decl *D) { std::vector> locateDecl(const Decl &D) { std::vector> Result; // FIXME: Should we also provide physical locations? - if (auto SS = tooling::stdlib::Recognizer()(&D)) - return {{*SS, Hints::CompleteSymbol}}; + if (auto SS = tooling::stdlib::Recognizer()(&D)) { + Result.push_back({*SS, Hints::CompleteSymbol}); + if (!D.hasBody()) + return Result; + } // FIXME: Signal foreign decls, e.g. a forward declaration not owned by a // library. Some useful signals could be derived by checking the DeclContext. 
// Most incidental forward decls look like: diff --git a/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp b/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp index 07302142a13e36..fdcbf25fd628c0 100644 --- a/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp @@ -628,6 +628,17 @@ TEST_F(HeadersForSymbolTest, StandardHeaders) { tooling::stdlib::Header::named(""))); } +TEST_F(HeadersForSymbolTest, NonStandardHeaders) { + Inputs.Code = "void assert() {}"; + buildAST(); + EXPECT_THAT( + headersFor("assert"), + // Respect the ordering from the stdlib mapping. + UnorderedElementsAre(physicalHeader("input.mm"), + tooling::stdlib::Header::named(""), + tooling::stdlib::Header::named(""))); +} + TEST_F(HeadersForSymbolTest, ExporterNoNameMatch) { Inputs.Code = R"cpp( #include "exporter/foo.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp index e10ac3f46e2e9d..d5ea96b00254c2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp @@ -15,3 +15,11 @@ std::string HelloString; // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: no header providing "std::string" is directly included [misc-include-cleaner] int FooBarResult = foobar(); // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: no header providing "foobar" is directly included [misc-include-cleaner] + +namespace valid { + +namespace gh93335 { +void log2() {} +} // namespace gh93335 + +} // namespace valid From da5f45f5937d3cde4ff76aeeb208e72ee504baaf Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 4 Jun 2024 14:40:09 +0200 Subject: [PATCH 66/82] [ConstantFolding] Preserve nowrap flags in gep of gep fold A caveat here is that we can only preserve nusw if the offset additions did not overflow. 
Proofs: https://alive2.llvm.org/ce/z/u56z_u --- llvm/lib/Analysis/ConstantFolding.cpp | 23 +++++++++---- .../Transforms/InstCombine/getelementptr.ll | 34 +++++++++++++++++-- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 3ca3ae951fcd78..e0f5bf0ab83658 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -866,8 +866,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, ArrayRef Ops, const DataLayout &DL, const TargetLibraryInfo *TLI) { - bool InBounds = GEP->isInBounds(); - Type *SrcElemTy = GEP->getSourceElementType(); Type *ResTy = GEP->getType(); if (!SrcElemTy->isSized() || isa(SrcElemTy)) @@ -898,8 +896,10 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, InRange = InRange->sextOrTrunc(BitWidth); // If this is a GEP of a GEP, fold it all into a single GEP. + GEPNoWrapFlags NW = GEP->getNoWrapFlags(); + bool Overflow = false; while (auto *GEP = dyn_cast(Ptr)) { - InBounds &= GEP->isInBounds(); + NW &= GEP->getNoWrapFlags(); SmallVector NestedOps(llvm::drop_begin(GEP->operands())); @@ -923,9 +923,16 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, Ptr = cast(GEP->getOperand(0)); SrcElemTy = GEP->getSourceElementType(); - Offset += APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps)); + Offset = Offset.sadd_ov( + APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps)), + Overflow); } + // Preserving nusw (without inbounds) also requires that the offset + // additions did not overflow. + if (NW.hasNoUnsignedSignedWrap() && !NW.isInBounds() && Overflow) + NW = NW.withoutNoUnsignedSignedWrap(); + // If the base value for this address is a literal integer value, fold the // getelementptr to the resulting integer value casted to the pointer type. 
APInt BasePtr(BitWidth, 0); @@ -944,17 +951,19 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, } // Try to infer inbounds for GEPs of globals. - if (!InBounds && Offset.isNonNegative()) { + // TODO(gep_nowrap): Also infer nuw flag. + if (!NW.isInBounds() && Offset.isNonNegative()) { bool CanBeNull, CanBeFreed; uint64_t DerefBytes = Ptr->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); - InBounds = DerefBytes != 0 && !CanBeNull && Offset.sle(DerefBytes); + if (DerefBytes != 0 && !CanBeNull && Offset.sle(DerefBytes)) + NW |= GEPNoWrapFlags::inBounds(); } // Otherwise canonicalize this to a single ptradd. LLVMContext &Ctx = Ptr->getContext(); return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ctx), Ptr, - ConstantInt::get(Ctx, Offset), InBounds, + ConstantInt::get(Ctx, Offset), NW, InRange); } diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 871913610429b3..f968fa6f9afd0c 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -419,20 +419,48 @@ define ptr @test_index_canon_nusw_nuw(ptr %X, i32 %Idx) { ret ptr %R } -define ptr @test_index_canon_const_expr_inbounds(ptr %X, i32 %Idx) { +define ptr @test_index_canon_const_expr_inbounds() { ; CHECK-LABEL: @test_index_canon_const_expr_inbounds( ; CHECK-NEXT: ret ptr getelementptr inbounds (i8, ptr @Global, i64 123) ; ret ptr getelementptr inbounds (i8, ptr @Global, i32 123) } -define ptr @test_index_canon_const_expr_nuw_nusw(ptr %X, i32 %Idx) { +define ptr @test_index_canon_const_expr_nuw_nusw() { ; CHECK-LABEL: @test_index_canon_const_expr_nuw_nusw( -; CHECK-NEXT: ret ptr getelementptr (i8, ptr @Global, i64 123) +; CHECK-NEXT: ret ptr getelementptr nusw nuw (i8, ptr @Global, i64 123) ; ret ptr getelementptr nusw nuw (i8, ptr @Global, i32 123) } +define ptr @test_const_gep_gep_nuw() { +; CHECK-LABEL: @test_const_gep_gep_nuw( +; CHECK-NEXT: ret ptr getelementptr 
nuw (i8, ptr @Global, i64 246) +; + ret ptr getelementptr nuw (i8, ptr getelementptr nuw (i8, ptr @Global, i64 123), i64 123) +} + +define ptr @test_const_gep_gep_nusw_no_overflow() { +; CHECK-LABEL: @test_const_gep_gep_nusw_no_overflow( +; CHECK-NEXT: ret ptr getelementptr nusw (i8, ptr @Global, i64 246) +; + ret ptr getelementptr nusw (i8, ptr getelementptr nusw (i8, ptr @Global, i64 123), i64 123) +} + +define ptr @test_const_gep_gep_nusw_no_overflow_neg() { +; CHECK-LABEL: @test_const_gep_gep_nusw_no_overflow_neg( +; CHECK-NEXT: ret ptr getelementptr nusw (i8, ptr @Global, i64 -246) +; + ret ptr getelementptr nusw (i8, ptr getelementptr nusw (i8, ptr @Global, i64 -123), i64 -123) +} + +define ptr @test_const_gep_gep_nusw_overflow() { +; CHECK-LABEL: @test_const_gep_gep_nusw_overflow( +; CHECK-NEXT: ret ptr getelementptr (i8, ptr @Global, i64 -2) +; + ret ptr getelementptr nusw (i8, ptr getelementptr nusw (i8, ptr @Global, i64 u0x7fffffffffffffff), i64 u0x7fffffffffffffff) +} + define i1 @test17(ptr %P, i32 %I, i32 %J) { ; CHECK-LABEL: @test17( ; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[I:%.*]], [[J:%.*]] From 32add2435f9781418aa8ba8c90f9b7f0d2ac2f28 Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Tue, 11 Jun 2024 06:04:09 -0700 Subject: [PATCH 67/82] Fix test to have correct requirements (#95106) --- llvm/test/CodeGen/ARM/apple-version-min.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/CodeGen/ARM/apple-version-min.ll b/llvm/test/CodeGen/ARM/apple-version-min.ll index 6b4af21d74c00d..180c19e5e36230 100644 --- a/llvm/test/CodeGen/ARM/apple-version-min.ll +++ b/llvm/test/CodeGen/ARM/apple-version-min.ll @@ -1,5 +1,8 @@ ; Test emitting version_min directives. +; Let's not split this into separate ARM/AArch64 parts. 
+; REQUIRES: aarch64-registered-target + ; RUN: llc %s -filetype=asm -o - --mtriple arm64-apple-tvos9.0.0 | FileCheck %s --check-prefix=TVOS ; RUN: llc %s -filetype=asm -o - --mtriple thumbv7s-apple-ios7.0.0 | FileCheck %s --check-prefix=IOS ; RUN: llc %s -filetype=asm -o - --mtriple thumbv7k-apple-watchos2.0.0 | FileCheck %s --check-prefix=WATCHOS From e805b77107c8a26ad129fb2a46cdec01c90628be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 11 Jun 2024 13:25:02 +0200 Subject: [PATCH 68/82] [clang][Interp] Support ObjCEncodeExprs --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 11 +++++++++++ clang/lib/AST/Interp/ByteCodeExprGen.h | 1 + clang/test/CodeGenObjC/encode-test-3.m | 1 + 3 files changed, 13 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 0899a98b3b95a6..0385ca4b3a0639 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1688,6 +1688,17 @@ bool ByteCodeExprGen::VisitObjCStringLiteral( return this->delegate(E->getString()); } +template +bool ByteCodeExprGen::VisitObjCEncodeExpr(const ObjCEncodeExpr *E) { + auto &A = Ctx.getASTContext(); + std::string Str; + A.getObjCEncodingForType(E->getEncodedType(), Str); + StringLiteral *SL = + StringLiteral::Create(A, Str, StringLiteralKind::Ordinary, + /*Pascal=*/false, E->getType(), E->getAtLoc()); + return this->delegate(SL); +} + template bool ByteCodeExprGen::VisitSYCLUniqueStableNameExpr( const SYCLUniqueStableNameExpr *E) { diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 7ab14b6ab383e8..295cfef0525cd8 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -91,6 +91,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, bool VisitAbstractConditionalOperator(const AbstractConditionalOperator *E); bool VisitStringLiteral(const StringLiteral *E); bool 
VisitObjCStringLiteral(const ObjCStringLiteral *E); + bool VisitObjCEncodeExpr(const ObjCEncodeExpr *E); bool VisitSYCLUniqueStableNameExpr(const SYCLUniqueStableNameExpr *E); bool VisitCharacterLiteral(const CharacterLiteral *E); bool VisitCompoundAssignOperator(const CompoundAssignOperator *E); diff --git a/clang/test/CodeGenObjC/encode-test-3.m b/clang/test/CodeGenObjC/encode-test-3.m index 0856b770d65a50..30557fccf02df7 100644 --- a/clang/test/CodeGenObjC/encode-test-3.m +++ b/clang/test/CodeGenObjC/encode-test-3.m @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple=i686-apple-darwin9 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple=i686-apple-darwin9 -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s int main(void) { int n; From 400d4fd7b6dea9c7cdd255bb804fcd0ee77f6d42 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 11 Jun 2024 14:16:32 +0100 Subject: [PATCH 69/82] [RemoveDIs] Update all docs to use debug records (#91768) As we approach the state where support for debug intrinsics is dropping and we print and use debug records by default, the documentation should be updated to refer to debug records as the primary debug info representation, with debug intrinsics being relegated to an optional alternative. This patch performs a few updates: - Replace references to intrinsics with references to records across all the documentation. - Replace intrinsics with records in code examples. - Move debug records prior to debug intrinsics in the SourceLevelDebugging document, and change text to refer to them as the primary representation. - Add release notes describing the change. 
--- llvm/docs/AssignmentTracking.md | 113 +++--- llvm/docs/HowToUpdateDebugInfo.rst | 16 +- llvm/docs/InstrRefDebugInfo.md | 2 +- llvm/docs/LangRef.rst | 83 ++-- llvm/docs/MIRLangRef.rst | 8 +- llvm/docs/Passes.rst | 5 +- llvm/docs/ReleaseNotes.rst | 4 + llvm/docs/SourceLevelDebugging.rst | 376 +++++++++--------- .../MyFirstLanguageFrontend/LangImpl09.rst | 2 +- 9 files changed, 313 insertions(+), 296 deletions(-) diff --git a/llvm/docs/AssignmentTracking.md b/llvm/docs/AssignmentTracking.md index 5a8bc5844eef6b..a24a8b0d797f87 100644 --- a/llvm/docs/AssignmentTracking.md +++ b/llvm/docs/AssignmentTracking.md @@ -11,7 +11,7 @@ The core idea is to track more information about source assignments in order and preserve enough information to be able to defer decisions about whether to use non-memory locations (register, constant) or memory locations until after middle end optimisations have run. This is in opposition to using -`llvm.dbg.declare` and `llvm.dbg.value`, which is to make the decision for most +`#dbg_declare` and `#dbg_value`, which is to make the decision for most variables early on, which can result in suboptimal variable locations that may be either incorrect or incomplete. @@ -26,19 +26,18 @@ except for development and testing. **Enable in Clang**: `-Xclang -fexperimental-assignment-tracking` That causes Clang to get LLVM to run the pass `declare-to-assign`. The pass -converts conventional debug intrinsics to assignment tracking metadata and sets +converts conventional debug records to assignment tracking metadata and sets the module flag `debug-info-assignment-tracking` to the value `i1 true`. To check whether assignment tracking is enabled for a module call `isAssignmentTrackingEnabled(const Module &M)` (from `llvm/IR/DebugInfo.h`). 
## Design and implementation -### Assignment markers: `llvm.dbg.assign` +### Assignment markers: `#dbg_assign` -`llvm.dbg.value`, a conventional debug intrinsic, marks out a position in the +`#dbg_value`, a conventional debug record, marks out a position in the IR where a variable takes a particular value. Similarly, Assignment Tracking -marks out the position of assignments with a new intrinsic called -`llvm.dbg.assign`. +marks out the position of assignments with a record called `#dbg_assign`. In order to know where in IR it is appropriate to use a memory location for a variable, each assignment marker must in some way refer to the store, if any @@ -48,24 +47,23 @@ important benefit of referring to the store is that we can then build a two-way mapping of stores<->markers that can be used to find markers that need to be updated when stores are modified. -An `llvm.dbg.assign` marker that is not linked to any instruction signals that +An `#dbg_assign` marker that is not linked to any instruction signals that the store that performed the assignment has been optimised out, and therefore the memory location will not be valid for at least some part of the program. -Here's the `llvm.dbg.assign` signature. Each parameter is wrapped in -`MetadataAsValue`, and `Value *` type parameters are first wrapped in -`ValueAsMetadata`: +Here's the `#dbg_assign` signature. `Value *` type parameters are first wrapped +in `ValueAsMetadata`: ``` -void @llvm.dbg.assign(Value *Value, - DIExpression *ValueExpression, - DILocalVariable *Variable, - DIAssignID *ID, - Value *Address, - DIExpression *AddressExpression) + #dbg_assign(Value *Value, + DIExpression *ValueExpression, + DILocalVariable *Variable, + DIAssignID *ID, + Value *Address, + DIExpression *AddressExpression) ``` -The first three parameters look and behave like an `llvm.dbg.value`. `ID` is a +The first three parameters look and behave like an `#dbg_value`. `ID` is a reference to a store (see next section). 
`Address` is the destination address of the store and it is modified by `AddressExpression`. An empty/undef/poison address means the address component has been killed (the memory address is no @@ -73,18 +71,13 @@ longer a valid location). LLVM currently encodes variable fragment information in `DIExpression`s, so as an implementation quirk the `FragmentInfo` for `Variable` is contained within `ValueExpression` only. -The formal LLVM-IR signature is: -``` -void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) -``` - ### Instruction link: `DIAssignID` `DIAssignID` metadata is the mechanism that is currently used to encode the store<->marker link. The metadata node has no operands and all instances are `distinct`; equality is checked for by comparing addresses. -`llvm.dbg.assign` intrinsics use a `DIAssignID` metadata node instance as an +`#dbg_assign` records use a `DIAssignID` metadata node instance as an operand. This way it refers to any store-like instruction that has the same `DIAssignID` attachment. E.g. 
For this test.cpp, @@ -102,9 +95,9 @@ we get: define dso_local noundef i32 @_Z3funi(i32 noundef %a) #0 !dbg !8 { entry: %a.addr = alloca i32, align 4, !DIAssignID !13 - call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata i32* %a.addr, metadata !DIExpression()), !dbg !15 + #dbg_assign(i1 undef, !14, !DIExpression(), !13, i32* %a.addr, !DIExpression(), !15) store i32 %a, i32* %a.addr, align 4, !DIAssignID !16 - call void @llvm.dbg.assign(metadata i32 %a, metadata !14, metadata !DIExpression(), metadata !16, metadata i32* %a.addr, metadata !DIExpression()), !dbg !15 + #dbg_assign(i32 %a, !14, !DIExpression(), !16, i32* %a.addr, !DIExpression(), !15) %0 = load i32, i32* %a.addr, align 4, !dbg !17 ret i32 %0, !dbg !18 } @@ -116,16 +109,16 @@ entry: !16 = distinct !DIAssignID() ``` -The first `llvm.dbg.assign` refers to the `alloca` through `!DIAssignID !13`, +The first `#dbg_assign` refers to the `alloca` through `!DIAssignID !13`, and the second refers to the `store` through `!DIAssignID !16`. ### Store-like instructions -In the absence of a linked `llvm.dbg.assign`, a store to an address that is +In the absence of a linked `#dbg_assign`, a store to an address that is known to be the backing storage for a variable is considered to represent an assignment to that variable. -This gives us a safe fall-back in cases where `llvm.dbg.assign` intrinsics have +This gives us a safe fall-back in cases where `#dbg_assign` records have been deleted, the `DIAssignID` attachment on the store has been dropped, or the optimiser has made a once-indirect store (not tracked with Assignment Tracking) direct. @@ -139,61 +132,61 @@ direct. instruction. In this case, the assignment is considered to take place in multiple positions in the program. -**Moving** a non-debug instruction: nothing new to do. Instructions linked to an -`llvm.dbg.assign` have their initial IR position marked by the position of the -`llvm.dbg.assign`. 
+**Moving** a non-debug instruction: nothing new to do. Instructions linked to a +`#dbg_assign` have their initial IR position marked by the position of the +`#dbg_assign`. **Deleting** a non-debug instruction: nothing new to do. Simple DSE does not require any change; it’s safe to delete an instruction with a `DIAssignID` -attachment. An `llvm.dbg.assign` that uses a `DIAssignID` that is not attached +attachment. A `#dbg_assign` that uses a `DIAssignID` that is not attached to any instruction indicates that the memory location isn’t valid. **Merging** stores: In many cases no change is required as `DIAssignID` attachments are automatically merged if `combineMetadata` is called. One way or another, the `DIAssignID` attachments must be merged such that new store -becomes linked to all the `llvm.dbg.assign` intrinsics that the merged stores +becomes linked to all the `#dbg_assign` records that the merged stores were linked to. This can be achieved simply by calling a helper function `Instruction::mergeDIAssignID`. -**Inlining** stores: As stores are inlined we generate `llvm.dbg.assign` -intrinsics and `DIAssignID` attachments as if the stores represent source +**Inlining** stores: As stores are inlined we generate `#dbg_assign` +records and `DIAssignID` attachments as if the stores represent source assignments, just like the in frontend. This isn’t perfect, as stores may have been moved, modified or deleted before inlining, but it does at least keep the information about the variable correct within the non-inlined scope. -**Splitting** stores: SROA and passes that split stores treat `llvm.dbg.assign` -intrinsics similarly to `llvm.dbg.declare` intrinsics. Clone the -`llvm.dbg.assign` intrinsics linked to the store, update the FragmentInfo in -the `ValueExpression`, and give the split stores (and cloned intrinsics) new +**Splitting** stores: SROA and passes that split stores treat `#dbg_assign` +records similarly to `#dbg_declare` records. 
Clone the +`#dbg_assign` records linked to the store, update the FragmentInfo in +the `ValueExpression`, and give the split stores (and cloned records) new `DIAssignID` attachments each. In other words, treat the split stores as separate assignments. For partial DSE (e.g. shortening a memset), we do the -same except that `llvm.dbg.assign` for the dead fragment gets an `Undef` +same except that `#dbg_assign` for the dead fragment gets an `Undef` `Address`. -**Promoting** allocas and store/loads: `llvm.dbg.assign` intrinsics implicitly +**Promoting** allocas and store/loads: `#dbg_assign` records implicitly describe joined values in memory locations at CFG joins, but this is not necessarily the case after promoting (or partially promoting) the variable. Passes that promote variables are responsible for inserting -`llvm.dbg.assign` intrinsics after the resultant PHIs generated during -promotion. `mem2reg` already has to do this (with `llvm.dbg.value`) for -`llvm.dbg.declare`s. Where a store has no linked intrinsic, the store is +`#dbg_assign` records after the resultant PHIs generated during +promotion. `mem2reg` already has to do this (with `#dbg_value`) for +`#dbg_declare`s. Where a store has no linked record, the store is assumed to represent an assignment for variables stored at the destination address. -#### Debug intrinsic updates +#### Debug record updates -**Moving** a debug intrinsic: avoid moving `llvm.dbg.assign` intrinsics where +**Moving** a debug record: avoid moving `#dbg_assign` records where possible, as they represent a source-level assignment, whose position in the program should not be affected by optimization passes. -**Deleting** a debug intrinsic: Nothing new to do. Just like for conventional -debug intrinsics, unless it is unreachable, it’s almost always incorrect to -delete a `llvm.dbg.assign` intrinsic. +**Deleting** a debug record: Nothing new to do. 
Just like for conventional +debug records, unless it is unreachable, it’s almost always incorrect to +delete a `#dbg_assign` record. -### Lowering `llvm.dbg.assign` to MIR +### Lowering `#dbg_assign` to MIR -To begin with only SelectionDAG ISel will be supported. `llvm.dbg.assign` -intrinsics are lowered to MIR `DBG_INSTR_REF` instructions. Before this happens +To begin with only SelectionDAG ISel will be supported. `#dbg_assign` +records are lowered to MIR `DBG_INSTR_REF` instructions. Before this happens we need to decide where it is appropriate to use memory locations and where we must use a non-memory location (or no location) for each variable. In order to make those decisions we run a standard fixed-point dataflow analysis that makes @@ -214,9 +207,9 @@ to tackle: clang/test/CodeGen/assignment-tracking/assignment-tracking.cpp for examples. * `trackAssignments` doesn't yet work for variables that have their - `llvm.dbg.declare` location modified by a `DIExpression`, e.g. when the + `#dbg_declare` location modified by a `DIExpression`, e.g. when the address of the variable is itself stored in an `alloca` with the - `llvm.dbg.declare` using `DIExpression(DW_OP_deref)`. See `indirectReturn` in + `#dbg_declare` using `DIExpression(DW_OP_deref)`. See `indirectReturn` in llvm/test/DebugInfo/Generic/assignment-tracking/track-assignments.ll and in clang/test/CodeGen/assignment-tracking/assignment-tracking.cpp for an example. @@ -225,13 +218,13 @@ to tackle: memory location is available without using a `DIAssignID`. This is because the storage address is not computed by an instruction (it's an argument value) and therefore we have nowhere to put the metadata attachment. 
To solve - this we probably need another marker intrinsic to denote "the variable's - stack home is X address" - similar to `llvm.dbg.declare` except that it needs - to compose with `llvm.dbg.assign` intrinsics such that the stack home address - is only selected as a location for the variable when the `llvm.dbg.assign` - intrinsics agree it should be. + this we probably need another marker record to denote "the variable's + stack home is X address" - similar to `#dbg_declare` except that it needs + to compose with `#dbg_assign` records such that the stack home address + is only selected as a location for the variable when the `#dbg_assign` + records agree it should be. -* Given the above (a special "the stack home is X" intrinsic), and the fact +* Given the above (a special "the stack home is X" record), and the fact that we can only track assignments with fixed offsets and sizes, I think we can probably get rid of the address and address-expression part, since it will always be computable with the info we have. diff --git a/llvm/docs/HowToUpdateDebugInfo.rst b/llvm/docs/HowToUpdateDebugInfo.rst index c64b5d1d0d98b6..db3465aa54d844 100644 --- a/llvm/docs/HowToUpdateDebugInfo.rst +++ b/llvm/docs/HowToUpdateDebugInfo.rst @@ -151,7 +151,7 @@ Deleting an IR-level Instruction When an ``Instruction`` is deleted, its debug uses change to ``undef``. This is a loss of debug info: the value of one or more source variables becomes -unavailable, starting with the ``llvm.dbg.value(undef, ...)``. When there is no +unavailable, starting with the ``#dbg_value(undef, ...)``. When there is no way to reconstitute the value of the lost instruction, this is the best possible outcome. However, it's often possible to do better: @@ -172,7 +172,7 @@ possible outcome. However, it's often possible to do better: define i16 @foo(i16 %a) { %b = sext i16 %a to i32 %c = and i32 %b, 15 - call void @llvm.dbg.value(metadata i32 %c, ...) + #dbg_value(i32 %c, ...) 
%d = trunc i32 %c to i16 ret i16 %d } @@ -183,7 +183,7 @@ replaced with a simplified instruction: .. code-block:: llvm define i16 @foo(i16 %a) { - call void @llvm.dbg.value(metadata i32 undef, ...) + #dbg_value(i32 undef, ...) %simplified = and i16 %a, 15 ret i16 %simplified } @@ -204,7 +204,7 @@ This results in better debug info because the debug use of ``%c`` is preserved: define i16 @foo(i16 %a) { %simplified = and i16 %a, 15 - call void @llvm.dbg.value(metadata i16 %simplified, ...) + #dbg_value(i16 %simplified, ...) ret i16 %simplified } @@ -249,7 +249,7 @@ module, and the second checks that this DI is still available after an optimization has occurred, reporting any errors/warnings while doing so. The instructions are assigned sequentially increasing line locations, and are -immediately used by debug value intrinsics everywhere possible. +immediately used by debug value records everywhere possible. For example, here is a module before: @@ -271,10 +271,10 @@ and after running ``opt -debugify``: define void @f(i32* %x) !dbg !6 { entry: %x.addr = alloca i32*, align 8, !dbg !12 - call void @llvm.dbg.value(metadata i32** %x.addr, metadata !9, metadata !DIExpression()), !dbg !12 + #dbg_value(i32** %x.addr, !9, !DIExpression(), !12) store i32* %x, i32** %x.addr, align 8, !dbg !13 %0 = load i32*, i32** %x.addr, align 8, !dbg !14 - call void @llvm.dbg.value(metadata i32* %0, metadata !11, metadata !DIExpression()), !dbg !14 + #dbg_value(i32* %0, !11, !DIExpression(), !14) store i32 10, i32* %0, align 4, !dbg !15 ret void, !dbg !16 } @@ -409,7 +409,7 @@ as follows: $ clang -Xclang -fverify-debuginfo-preserve -Xclang -fverify-debuginfo-preserve-export=sample.json -g -O2 sample.c Please do note that there are some known false positives, for source locations -and debug intrinsic checking, so that will be addressed as a future work. +and debug record checking, so that will be addressed as a future work. 
Mutation testing for MIR-level transformations ---------------------------------------------- diff --git a/llvm/docs/InstrRefDebugInfo.md b/llvm/docs/InstrRefDebugInfo.md index 3917989e4026df..eb7a0464b90a0c 100644 --- a/llvm/docs/InstrRefDebugInfo.md +++ b/llvm/docs/InstrRefDebugInfo.md @@ -24,7 +24,7 @@ referring to instruction values: ```llvm %2 = add i32 %0, %1 -call void @llvm.dbg.value(metadata i32 %2, + #dbg_value(metadata i32 %2, ``` In LLVM IR, the IR Value is synonymous with the instruction that computes the diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index c11a6627d81d31..f39b8dc6c90d47 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -6288,11 +6288,11 @@ DIExpression """""""""""" ``DIExpression`` nodes represent expressions that are inspired by the DWARF -expression language. They are used in :ref:`debug intrinsics` -(such as ``llvm.dbg.declare`` and ``llvm.dbg.value``) to describe how the +expression language. They are used in :ref:`debug records ` +(such as ``#dbg_declare`` and ``#dbg_value``) to describe how the referenced LLVM variable relates to the source language variable. Debug -intrinsics are interpreted left-to-right: start by pushing the value/address -operand of the intrinsic onto a stack, then repeatedly push and evaluate +expressions are interpreted left-to-right: start by pushing the value/address +operand of the record onto a stack, then repeatedly push and evaluate opcodes from the DIExpression until the final variable description is produced. 
The current supported opcode vocabulary is limited: @@ -6389,23 +6389,24 @@ The current supported opcode vocabulary is limited: IR for "*ptr = 4;" -------------- - call void @llvm.dbg.value(metadata i32 4, metadata !17, metadata !20) + #dbg_value(i32 4, !17, !DIExpression(DW_OP_LLVM_implicit_pointer), !20) !17 = !DILocalVariable(name: "ptr1", scope: !12, file: !3, line: 5, type: !18) !18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64) !19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) - !20 = !DIExpression(DW_OP_LLVM_implicit_pointer)) + !20 = !DILocation(line: 10, scope: !12) IR for "**ptr = 4;" -------------- - call void @llvm.dbg.value(metadata i32 4, metadata !17, metadata !21) + #dbg_value(i32 4, !17, + !DIExpression(DW_OP_LLVM_implicit_pointer, DW_OP_LLVM_implicit_pointer), + !21) !17 = !DILocalVariable(name: "ptr1", scope: !12, file: !3, line: 5, type: !18) !18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64) !19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, size: 64) !20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) - !21 = !DIExpression(DW_OP_LLVM_implicit_pointer, - DW_OP_LLVM_implicit_pointer)) + !21 = !DILocation(line: 10, scope: !12) DWARF specifies three kinds of simple location descriptions: Register, memory, and implicit location descriptions. Note that a location description is @@ -6416,45 +6417,48 @@ sense that a debugger might modify its value), whereas *implicit locations* describe merely the actual *value* of a source variable which might not exist in registers or in memory (see ``DW_OP_stack_value``). -A ``llvm.dbg.declare`` intrinsic describes an indirect value (the address) of a -source variable. The first operand of the intrinsic must be an address of some -kind. A DIExpression attached to the intrinsic refines this address to produce a +A ``#dbg_declare`` record describes an indirect value (the address) of a +source variable. 
The first operand of the record must be an address of some +kind. A DIExpression operand to the record refines this address to produce a concrete location for the source variable. -A ``llvm.dbg.value`` intrinsic describes the direct value of a source variable. -The first operand of the intrinsic may be a direct or indirect value. A -DIExpression attached to the intrinsic refines the first operand to produce a +A ``#dbg_value`` record describes the direct value of a source variable. +The first operand of the record may be a direct or indirect value. A +DIExpression operand to the record refines the first operand to produce a direct value. For example, if the first operand is an indirect value, it may be necessary to insert ``DW_OP_deref`` into the DIExpression in order to produce a -valid debug intrinsic. +valid debug record. .. note:: A DIExpression is interpreted in the same way regardless of which kind of - debug intrinsic it's attached to. + debug record it's attached to. + + DIExpressions are always printed and parsed inline; they can never be + referenced by an ID (e.g. ``!1``). .. code-block:: text - !0 = !DIExpression(DW_OP_deref) - !1 = !DIExpression(DW_OP_plus_uconst, 3) - !1 = !DIExpression(DW_OP_constu, 3, DW_OP_plus) - !2 = !DIExpression(DW_OP_bit_piece, 3, 7) - !3 = !DIExpression(DW_OP_deref, DW_OP_constu, 3, DW_OP_plus, DW_OP_LLVM_fragment, 3, 7) - !4 = !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef) - !5 = !DIExpression(DW_OP_constu, 42, DW_OP_stack_value) + !DIExpression(DW_OP_deref) + !DIExpression(DW_OP_plus_uconst, 3) + !DIExpression(DW_OP_constu, 3, DW_OP_plus) + !DIExpression(DW_OP_bit_piece, 3, 7) + !DIExpression(DW_OP_deref, DW_OP_constu, 3, DW_OP_plus, DW_OP_LLVM_fragment, 3, 7) + !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef) + !DIExpression(DW_OP_constu, 42, DW_OP_stack_value) DIAssignID """""""""" ``DIAssignID`` nodes have no operands and are always distinct. 
They are used to -link together `@llvm.dbg.assign` intrinsics (:ref:`debug -intrinsics`) and instructions that store in IR. See `Debug Info -Assignment Tracking `_ for more info. +link together (:ref:`#dbg_assign records `) and instructions +that store in IR. See `Debug Info Assignment Tracking +`_ for more info. .. code-block:: llvm store i32 %a, ptr %a.addr, align 4, !DIAssignID !2 - llvm.dbg.assign(metadata %a, metadata !1, metadata !DIExpression(), !2, metadata %a.addr, metadata !DIExpression()), !dbg !3 + #dbg_assign(%a, !1, !DIExpression(), !2, %a.addr, !DIExpression(), !3) !2 = distinct !DIAssignID() @@ -6468,17 +6472,18 @@ DIArgList also be updated to mirror whatever we decide here. ``DIArgList`` nodes hold a list of constant or SSA value references. These are -used in :ref:`debug intrinsics` (currently only in -``llvm.dbg.value``) in combination with a ``DIExpression`` that uses the +used in :ref:`debug records ` in combination with a +``DIExpression`` that uses the ``DW_OP_LLVM_arg`` operator. Because a DIArgList may refer to local values within a function, it must only be used as a function argument, must always be inlined, and cannot appear in named metadata. .. code-block:: text - llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %b), - metadata !16, - metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)) + #dbg_value(!DIArgList(i32 %a, i32 %b), + !16, + !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), + !26) DIFlags """"""" @@ -12957,12 +12962,12 @@ an extra level of indentation. As an example: #dbg_value(%inst1, !10, !DIExpression(), !11) %inst2 = op2 %inst1, %c -These debug records are an optional replacement for -:ref:`debug intrinsics`. Debug records will be output if the -``--write-experimental-debuginfo`` flag is passed to LLVM; it is an error for both -records and intrinsics to appear in the same module. More information about -debug records can be found in the `LLVM Source Level Debugging -`_ document. 
+These debug records replace the prior :ref:`debug intrinsics`. +Debug records will be disabled if ``--write-experimental-debuginfo=false`` is +passed to LLVM; it is an error for both records and intrinsics to appear in the +same module. More information about debug records can be found in the `LLVM +Source Level Debugging `_ +document. .. _intrinsics: diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index ec29870128c1d1..b4b59dbfa8ca4e 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -883,16 +883,16 @@ Where: - ``debug-info-location`` identifies a DILocation metadata node. -These metadata attributes correspond to the operands of a ``llvm.dbg.declare`` -IR intrinsic, see the :ref:`source level debugging` -documentation. +These metadata attributes correspond to the operands of a ``#dbg_declare`` +IR debug record, see the :ref:`source level +debugging` documentation. Varying variable locations ^^^^^^^^^^^^^^^^^^^^^^^^^^ Variables that are not always on the stack or change location are specified with the ``DBG_VALUE`` meta machine instruction. It is synonymous with the -``llvm.dbg.value`` IR intrinsic, and is written: +``#dbg_value`` IR record, and is written: .. code-block:: text diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst index 2edad5cd3c881a..49f633e98d16fe 100644 --- a/llvm/docs/Passes.rst +++ b/llvm/docs/Passes.rst @@ -935,8 +935,9 @@ declarations and removes them. Dead declarations are declarations of functions for which no implementation is available (i.e., declarations for unused library functions). -``strip-debug-declare``: Strip all ``llvm.dbg.declare`` intrinsics ------------------------------------------------------------------- +``strip-debug-declare``: Strip all ``llvm.dbg.declare`` intrinsics and +``#dbg_declare`` records. +------------------------------------------------------------------- Performs code stripping. Similar to strip, but only strips ``llvm.dbg.declare`` intrinsics. 
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 00e2969ee3543b..50c43fef3ad128 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -66,6 +66,10 @@ Changes to the LLVM IR * ``icmp`` * ``fcmp`` +* LLVM has switched from using debug intrinsics in textual IR to using debug + records by default. Details of the change and instructions on how to update + any downstream tools and tests can be found in the `migration docs + `_. Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst index 7f7e595eb14dfe..0acc929856eb6d 100644 --- a/llvm/docs/SourceLevelDebugging.rst +++ b/llvm/docs/SourceLevelDebugging.rst @@ -43,7 +43,7 @@ important ones are: debuggers, like GDB or DBX. The approach used by the LLVM implementation is to use a small set of -:ref:`intrinsic functions ` to define a mapping +:ref:`debug records ` to define a mapping between LLVM program objects and the source-level objects. The description of the source-level program is maintained in LLVM metadata in an :ref:`implementation-defined format ` (the C/C++ front-end @@ -169,83 +169,99 @@ Debug information descriptors are `specialized metadata nodes There are two models for defining the values of source variables at different states of the program and tracking these values through optimization and code -generation: :ref:`intrinsic function calls `, the -current default, and :ref:`debug records `, which are a new -non-instruction-based model -(for an explanation of how this works and why it is desirable, see the -`RemoveDIs `_ document). Each module must use one or -the other; they may never be mixed within an IR module. To enable writing debug -records instead of intrinsic calls, use the flag -``--write-experimental-debuginfo``. 
+generation: :ref:`debug records <debug_records>`, the current default, and +:ref:`intrinsic function calls <format_common_intrinsics>`, which are +non-default but currently supported for backwards compatibility - though these +two models must never be mixed within an IR module. For an explanation of why +we changed to the new model, how it works, and guidance on how to update old +code or IR to use debug records, see the `RemoveDIs <RemoveDIDebugInfo.html>`_ +document. -.. _format_common_intrinsics: +.. _debug_records: -Debugger intrinsic functions +Debug Records ---------------------------- -LLVM uses several intrinsic functions (name prefixed with "``llvm.dbg``") to -track source local variables through optimization and code generation. +Debug records define the value that a source variable has during execution of +the program; they appear interleaved with instructions, although they are not +instructions themselves and have no effect on the code generated by the +compiler. -``llvm.dbg.declare`` -^^^^^^^^^^^^^^^^^^^^ +LLVM uses several types of debug records to define source variables. The +common syntax for these records is: .. code-block:: llvm - void @llvm.dbg.declare(metadata, metadata, metadata) + #dbg_<kind>([<arg>, ]* <DILocation>) + ; Using the intrinsic model, the above is equivalent to: + call void llvm.dbg.<kind>([metadata <arg>, ]*), !dbg <DILocation> + +Debug records are always printed with an extra level of indentation compared +to instructions, and always have the prefix ``#dbg_`` and a list of +comma-separated arguments in parentheses, as with a ``call``. + +``#dbg_declare`` +^^^^^^^^^^^^^^^^ + +.. code-block:: llvm + + #dbg_declare([Value|MDNode], DILocalVariable, DIExpression, DILocation) -This record provides information about a local element (e.g., variable). 
+The first argument is an SSA value corresponding to a variable address, and is +typically a static alloca in the function entry block. The second argument is a `local variable `_ containing a description of the variable. The third argument is a `complex expression -`_. An `llvm.dbg.declare` intrinsic describes the +`_. The fourth argument is a `source location +`_. A ``#dbg_declare`` record describes the *address* of a source variable. -.. code-block:: text +.. code-block:: llvm %i.addr = alloca i32, align 4 - call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !1, - metadata !DIExpression()), !dbg !2 + #dbg_declare(ptr %i.addr, !1, !DIExpression(), !2) + ; ... !1 = !DILocalVariable(name: "i", ...) ; int i !2 = !DILocation(...) - ... + ; ... %buffer = alloca [256 x i8], align 8 ; The address of i is buffer+64. - call void @llvm.dbg.declare(metadata [256 x i8]* %buffer, metadata !3, - metadata !DIExpression(DW_OP_plus, 64)), !dbg !4 + #dbg_declare(ptr %buffer, !3, !DIExpression(DW_OP_plus, 64), !4) + ; ... !3 = !DILocalVariable(name: "i", ...) ; int i !4 = !DILocation(...) -A frontend should generate exactly one call to ``llvm.dbg.declare`` at the point +A frontend should generate exactly one ``#dbg_declare`` record at the point of declaration of a source variable. Optimization passes that fully promote the -variable from memory to SSA values will replace this call with possibly multiple -calls to `llvm.dbg.value`. Passes that delete stores are effectively partial -promotion, and they will insert a mix of calls to ``llvm.dbg.value`` to track -the source variable value when it is available. After optimization, there may be -multiple calls to ``llvm.dbg.declare`` describing the program points where the -variables lives in memory. All calls for the same concrete source variable must -agree on the memory location. +variable from memory to SSA values will replace this record with possibly +multiple ``#dbg_value`` records. 
Passes that delete stores are effectively +partial promotion, and they will insert a mix of ``#dbg_value`` records to +track the source variable value when it is available. After optimization, there +may be multiple ``#dbg_declare`` records describing the program points where +the variable lives in memory. All records for the same concrete source variable +must agree on the memory location. -``llvm.dbg.value`` -^^^^^^^^^^^^^^^^^^ +``#dbg_value`` +^^^^^^^^^^^^^^ .. code-block:: llvm - void @llvm.dbg.value(metadata, metadata, metadata) + #dbg_value([Value|DIArgList|MDNode], DILocalVariable, DIExpression, DILocation) -This intrinsic provides information when a user source variable is set to a new -value. The first argument is the new value (wrapped as metadata). The second -argument is a `local variable `_ containing a -description of the variable. The third argument is a `complex expression -`_. +This record provides information when a user source variable is set to a new +value. The first argument is the new value. The second argument is a `local +variable `_ containing a description of the +variable. The third argument is a `complex expression +`_. The fourth argument is a `source location +`_. -An `llvm.dbg.value` intrinsic describes the *value* of a source variable +A ``#dbg_value`` record describes the *value* of a source variable directly, not its address. Note that the value operand of this intrinsic may be indirect (i.e, a pointer to the source variable), provided that interpreting the complex expression derives the direct value. -``llvm.dbg.assign`` +``#dbg_assign`` ^^^^^^^^^^^^^^^^^^^ .. toctree:: :hidden: @@ -254,85 +270,87 @@ the complex expression derives the direct value. .. 
code-block:: llvm - void @llvm.dbg.assign(Value *Value, - DIExpression *ValueExpression, - DILocalVariable *Variable, - DIAssignID *ID, - Value *Address, - DIExpression *AddressExpression) + #dbg_assign( [Value|DIArgList|MDNode] Value, + DILocalVariable Variable, + DIExpression ValueExpression, + DIAssignID ID, + [Value|MDNode] Address, + DIExpression AddressExpression, + DILocation SourceLocation ) -This intrinsic marks the position in IR where a source assignment occurred. It +This record marks the position in IR where a source assignment occurred. It encodes the value of the variable. It references the store, if any, that performs the assignment, and the destination address. -The first three arguments are the same as for an ``llvm.dbg.value``. The fourth +The first three arguments are the same as for a ``#dbg_value``. The fourth argument is a ``DIAssignID`` used to reference a store. The fifth is the -destination of the store (wrapped as metadata), and the sixth is a `complex -expression `_ that modifies it. +destination of the store, the sixth is a `complex +expression `_ that modifies it, and the seventh is a +`source location `_. -The formal LLVM-IR signature is: +See :doc:`AssignmentTracking` for more info. -.. code-block:: llvm +Debugger intrinsic functions +---------------------------- - void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) +.. _format_common_intrinsics: +In intrinsic-mode, LLVM uses several intrinsic functions (name prefixed with "``llvm.dbg``") to +track source local variables through optimization and code generation. 
These +intrinsic functions each correspond to one of the debug records above, with a +few syntactic differences: each argument to a debugger intrinsic must be wrapped +as metadata, meaning it must be prefixed with ``metadata``, and the +``DILocation`` argument in each record must be a metadata attachment to the +call instruction, meaning it appears after the argument list with the prefix +``!dbg``. -See :doc:`AssignmentTracking` for more info. +``llvm.dbg.declare`` +^^^^^^^^^^^^^^^^^^^^ -.. _debug_records: +.. code-block:: llvm -Debug Records ----------------------------- + void @llvm.dbg.declare(metadata, metadata, metadata) -LLVM also has an alternative to intrinsic functions, debug records, which -function similarly but are not instructions. The basic syntax for debug records -is: +This intrinsic is equivalent to ``#dbg_declare``: .. code-block:: llvm - #dbg_([, ]* ) - ; Using the intrinsic model, the above is equivalent to: - call void llvm.dbg.([metadata , ]*), !dbg + #dbg_declare(i32* %i.addr, !1, !DIExpression(), !2) + call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !1, + metadata !DIExpression()), !dbg !2 -A debug intrinsic function can be converted to a debug record with the -following steps: +``llvm.dbg.value`` +^^^^^^^^^^^^^^^^^^ -1. Add an extra level of indentation. -2. Replace everything prior to the intrinsic kind (declare/value/assign) with - ``#dbg_``. -3. Remove the leading ``metadata`` from the intrinsic's arguments. -4. Transfer the ``!dbg`` attachment to be an argument, dropping the leading - ``!dbg``. +.. code-block:: llvm -For each kind of intrinsic function, there is an equivalent debug record. + void @llvm.dbg.value(metadata, metadata, metadata) -``#dbg_declare`` -^^^^^^^^^^^^^^^^ +This intrinsic is equivalent to ``#dbg_value``: .. code-block:: llvm - #dbg_declare([Value|MDNode], DILocalVariable, DIExpression, DILocation) - -Equivalent to the ``llvm.dbg.declare`` intrinsic. 
+ #dbg_value(i32 %i, !1, !DIExpression(), !2) + call void @llvm.dbg.value(metadata i32 %i, metadata !1, + metadata !DIExpression()), !dbg !2 -``#dbg_value`` -^^^^^^^^^^^^^^ +``llvm.dbg.assign`` +^^^^^^^^^^^^^^^^^^^ .. code-block:: llvm - #dbg_value([Value|DIArgList|MDNode], DILocalVariable, DIExpression, DILocation) - -Equivalent to the ``llvm.dbg.value`` intrinsic. + void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) -``#dbg_assign`` -^^^^^^^^^^^^^^^ +This intrinsic is equivalent to ``#dbg_assign``: .. code-block:: llvm - #dbg_assign([Value|DIArgList|MDNode], DILocalVariable, DIExpression, - DIAssignID, [Value|MDNode], DIExpression, DILocation) + #dbg_assign(i32 %i, !1, !DIExpression(), !2, + ptr %i.addr, !DIExpression(), !3) + call void @llvm.dbg.assign( + metadata i32 %i, metadata !1, metadata !DIExpression(), metadata !2, + metadata ptr %i.addr, metadata !DIExpression()), !dbg !3 -Equivalent to the ``llvm.dbg.assign`` intrinsic. Object lifetimes and scoping ============================ @@ -371,11 +389,11 @@ Compiled to LLVM, this function would be represented like this: %X = alloca i32, align 4 %Y = alloca i32, align 4 %Z = alloca i32, align 4 - call void @llvm.dbg.declare(metadata i32* %X, metadata !11, metadata !13), !dbg !14 + #dbg_declare(ptr %X, !11, !DIExpression(), !14) store i32 21, i32* %X, align 4, !dbg !14 - call void @llvm.dbg.declare(metadata i32* %Y, metadata !15, metadata !13), !dbg !16 + #dbg_declare(ptr %Y, !15, !DIExpression(), !16) store i32 22, i32* %Y, align 4, !dbg !16 - call void @llvm.dbg.declare(metadata i32* %Z, metadata !17, metadata !13), !dbg !19 + #dbg_declare(ptr %Z, !17, !DIExpression(), !19) store i32 23, i32* %Z, align 4, !dbg !19 %0 = load i32, i32* %X, align 4, !dbg !20 store i32 %0, i32* %Z, align 4, !dbg !21 @@ -384,9 +402,6 @@ Compiled to LLVM, this function would be represented like this: ret void, !dbg !24 } - ; Function Attrs: nounwind readnone - declare void 
@llvm.dbg.declare(metadata, metadata, metadata) #1 - attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } @@ -407,33 +422,32 @@ Compiled to LLVM, this function would be represented like this: !10 = !{!"clang version 3.7.0 (trunk 231150) (llvm/trunk 231154)"} !11 = !DILocalVariable(name: "X", scope: !4, file: !1, line: 2, type: !12) !12 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) - !13 = !DIExpression() - !14 = !DILocation(line: 2, column: 9, scope: !4) - !15 = !DILocalVariable(name: "Y", scope: !4, file: !1, line: 3, type: !12) - !16 = !DILocation(line: 3, column: 9, scope: !4) - !17 = !DILocalVariable(name: "Z", scope: !18, file: !1, line: 5, type: !12) - !18 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5) - !19 = !DILocation(line: 5, column: 11, scope: !18) - !20 = !DILocation(line: 6, column: 11, scope: !18) - !21 = !DILocation(line: 6, column: 9, scope: !18) - !22 = !DILocation(line: 8, column: 9, scope: !4) - !23 = !DILocation(line: 8, column: 7, scope: !4) - !24 = !DILocation(line: 9, column: 3, scope: !4) + !13 = !DILocation(line: 2, column: 9, scope: !4) + !14 = !DILocalVariable(name: "Y", scope: !4, file: !1, line: 3, type: !12) + !15 = !DILocation(line: 3, column: 9, scope: !4) + !16 = !DILocalVariable(name: "Z", scope: !18, file: !1, line: 5, type: !12) + !17 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5) + !18 = !DILocation(line: 5, column: 11, scope: !18) + !19 = !DILocation(line: 6, column: 11, scope: !18) + !20 = !DILocation(line: 6, column: 9, scope: !18) + !21 = !DILocation(line: 8, column: 9, scope: !4) + !22 = !DILocation(line: 8, column: 7, scope: !4) + !23 = !DILocation(line: 9, column: 3, scope: !4) This example illustrates a few important details about LLVM
debugging -information. In particular, it shows how the ``llvm.dbg.declare`` intrinsic and +information. In particular, it shows how the ``#dbg_declare`` record and location information, which are attached to an instruction, are applied together to allow a debugger to analyze the relationship between statements, variable definitions, and the code used to implement the function. .. code-block:: llvm - call void @llvm.dbg.declare(metadata i32* %X, metadata !11, metadata !13), !dbg !14 + #dbg_declare(ptr %X, !11, !DIExpression(), !14) ; [debug line = 2:7] [debug variable = X] -The first intrinsic ``%llvm.dbg.declare`` encodes debugging information for the -variable ``X``. The metadata ``!dbg !14`` attached to the intrinsic provides +The first record ``#dbg_declare`` encodes debugging information for the +variable ``X``. The location ``!14`` at the end of the record provides scope information for the variable ``X``. .. code-block:: text @@ -446,18 +460,18 @@ scope information for the variable ``X``. Here ``!14`` is metadata providing `location information <LangRef.html#dilocation>`_. In this example, scope is encoded by ``!4``, a `subprogram descriptor <LangRef.html#disubprogram>`_. This way the location -information attached to the intrinsics indicates that the variable ``X`` is +information parameter to the records indicates that the variable ``X`` is declared at line number 2 at a function level scope in function ``foo``. Now let's take another example. .. code-block:: llvm - call void @llvm.dbg.declare(metadata i32* %Z, metadata !17, metadata !13), !dbg !19 + #dbg_declare(ptr %Z, !17, !DIExpression(), !19) ; [debug line = 5:9] [debug variable = Z] -The third intrinsic ``%llvm.dbg.declare`` encodes debugging information for -variable ``Z``. The metadata ``!dbg !19`` attached to the intrinsic provides +The third record ``#dbg_declare`` encodes debugging information for +variable ``Z``. The metadata ``!19`` at the end of the record provides scope information for the variable ``Z``. ..
code-block:: text @@ -479,18 +493,18 @@ In the example above, every variable assignment uniquely corresponds to a memory store to the variable's position on the stack. However in heavily optimized code LLVM promotes most variables into SSA values, which can eventually be placed in physical registers or memory locations. To track SSA -values through compilation, when objects are promoted to SSA values an -``llvm.dbg.value`` intrinsic is created for each assignment, recording the -variable's new location. Compared with the ``llvm.dbg.declare`` intrinsic: +values through compilation, when objects are promoted to SSA values a +``#dbg_value`` record is created for each assignment, recording the +variable's new location. Compared with the ``#dbg_declare`` record: -* A dbg.value terminates the effect of any preceding dbg.values for (any +* A #dbg_value terminates the effect of any preceding #dbg_values for (any overlapping fragments of) the specified variable. -* The dbg.value's position in the IR defines where in the instruction stream +* The #dbg_value's position in the IR defines where in the instruction stream the variable's value changes. * Operands can be constants, indicating the variable is assigned a constant value. -Care must be taken to update ``llvm.dbg.value`` intrinsics when optimization +Care must be taken to update ``#dbg_value`` records when optimization passes alter or move instructions and blocks -- the developer could observe such changes reflected in the value of variables when debugging the program. For any execution of the optimized program, the set of variable values presented to the @@ -501,7 +515,7 @@ damaging their understanding of the optimized program and undermining their trust in the debugger. Sometimes perfectly preserving variable locations is not possible, often when a -redundant calculation is optimized out. In such cases, a ``llvm.dbg.value`` +redundant calculation is optimized out. 
In such cases, a ``#dbg_value`` with operand ``poison`` should be used, to terminate earlier variable locations and let the debugger present ``optimized out`` to the developer. Withholding these potentially stale variable values from the developer diminishes the @@ -514,26 +528,26 @@ To illustrate some potential issues, consider the following example: define i32 @foo(i32 %bar, i1 %cond) { entry: - call @llvm.dbg.value(metadata i32 0, metadata !1, metadata !2) + #dbg_value(i32 0, !1, !DIExpression(), !4) br i1 %cond, label %truebr, label %falsebr truebr: %tval = add i32 %bar, 1 - call @llvm.dbg.value(metadata i32 %tval, metadata !1, metadata !2) + #dbg_value(i32 %tval, !1, !DIExpression(), !4) %g1 = call i32 @gazonk() br label %exit falsebr: %fval = add i32 %bar, 2 - call @llvm.dbg.value(metadata i32 %fval, metadata !1, metadata !2) + #dbg_value(i32 %fval, !1, !DIExpression(), !4) %g2 = call i32 @gazonk() br label %exit exit: %merge = phi [ %tval, %truebr ], [ %fval, %falsebr ] %g = phi [ %g1, %truebr ], [ %g2, %falsebr ] - call @llvm.dbg.value(metadata i32 %merge, metadata !1, metadata !2) - call @llvm.dbg.value(metadata i32 %g, metadata !3, metadata !2) + #dbg_value(i32 %merge, !1, !DIExpression(), !4) + #dbg_value(i32 %g, !3, !DIExpression(), !4) %plusten = add i32 %merge, 10 %toret = add i32 %plusten, %g - call @llvm.dbg.value(metadata i32 %toret, metadata !1, metadata !2) + #dbg_value(i32 %toret, !1, !DIExpression(), !4) ret i32 %toret } @@ -551,48 +565,48 @@ perhaps, be optimized into the following code: ret i32 %toret } -What ``llvm.dbg.value`` intrinsics should be placed to represent the original variable +What ``#dbg_value`` records should be placed to represent the original variable locations in this code? Unfortunately the second, third and fourth -dbg.values for ``!1`` in the source function have had their operands +#dbg_values for ``!1`` in the source function have had their operands (%tval, %fval, %merge) optimized out. 
Assuming we cannot recover them, we -might consider this placement of dbg.values: +might consider this placement of #dbg_values: .. code-block:: llvm define i32 @foo(i32 %bar, i1 %cond) { entry: - call @llvm.dbg.value(metadata i32 0, metadata !1, metadata !2) + #dbg_value(i32 0, !1, !DIExpression(), !4) %g = call i32 @gazonk() - call @llvm.dbg.value(metadata i32 %g, metadata !3, metadata !2) + #dbg_value(i32 %g, !3, !DIExpression(), !4) %addoper = select i1 %cond, i32 11, i32 12 %plusten = add i32 %bar, %addoper %toret = add i32 %plusten, %g - call @llvm.dbg.value(metadata i32 %toret, metadata !1, metadata !2) + #dbg_value(i32 %toret, !1, !DIExpression(), !4) ret i32 %toret } However, this will cause ``!3`` to have the return value of ``@gazonk()`` at the same time as ``!1`` has the constant value zero -- a pair of assignments that never occurred in the unoptimized program. To avoid this, we must terminate -the range that ``!1`` has the constant value assignment by inserting a poison -dbg.value before the dbg.value for ``!3``: +the range that ``!1`` has the constant value assignment by inserting a poison +#dbg_value before the #dbg_value for ``!3``: ..
code-block:: llvm define i32 @foo(i32 %bar, i1 %cond) { entry: - call @llvm.dbg.value(metadata i32 0, metadata !1, metadata !2) + #dbg_value(i32 0, !1, !DIExpression(), !2) %g = call i32 @gazonk() - call @llvm.dbg.value(metadata i32 poison, metadata !1, metadata !2) - call @llvm.dbg.value(metadata i32 %g, metadata !3, metadata !2) + #dbg_value(i32 poison, !1, !DIExpression(), !2) + #dbg_value(i32 %g, !3, !DIExpression(), !2) %addoper = select i1 %cond, i32 11, i32 12 %plusten = add i32 %bar, %addoper %toret = add i32 %plusten, %g - call @llvm.dbg.value(metadata i32 %toret, metadata !1, metadata !2) + #dbg_value(i32 %toret, !1, !DIExpression(), !2) ret i32 %toret } -There are a few other dbg.value configurations that mean it terminates +There are a few other #dbg_value configurations that mean it terminates dominating location definitions without adding a new location. The complete list is: @@ -602,16 +616,16 @@ list is: * There are no location operands (empty ``DIArgList``) and the ``DIExpression`` is empty. -This class of dbg.value that kills variable locations is called a "kill -dbg.value" or "kill location", and for legacy reasons the term "undef -dbg.value" may be used in existing code. The ``DbgVariableIntrinsic`` methods +This class of #dbg_value that kills variable locations is called a "kill +#dbg_value" or "kill location", and for legacy reasons the term "undef +#dbg_value" may be used in existing code. The ``DbgVariableIntrinsic`` methods ``isKillLocation`` and ``setKillLocation`` should be used where possible rather -than inspecting location operands directly to check or set whether a dbg.value +than inspecting location operands directly to check or set whether a #dbg_value is a kill location. -In general, if any dbg.value has its operand optimized out and cannot be -recovered, then a kill dbg.value is necessary to terminate earlier variable -locations. 
Additional kill dbg.values may be necessary when the debugger can +In general, if any #dbg_value has its operand optimized out and cannot be +recovered, then a kill #dbg_value is necessary to terminate earlier variable +locations. Additional kill #dbg_values may be necessary when the debugger can observe re-ordering of assignments. How variable location metadata is transformed during CodeGen @@ -622,9 +636,9 @@ ultimately producing a mapping between source-level information and instruction ranges. This is relatively straightforwards for line number information, as mapping instructions to line numbers is a simple association. For variable locations -however the story is more complex. As each ``llvm.dbg.value`` intrinsic +however the story is more complex. As each ``#dbg_value`` record represents a source-level assignment of a value to a source variable, the -variable location intrinsics effectively embed a small imperative program +debug records effectively embed a small imperative program within the LLVM IR. By the end of CodeGen, this becomes a mapping from each variable to their machine locations over ranges of instructions. From IR to object emission, the major transformations which affect variable @@ -639,7 +653,7 @@ significantly change the ordering of the program, and occurs in a number of different passes. Some variable locations are not transformed during CodeGen. Stack locations -specified by ``llvm.dbg.declare`` are valid and unchanging for the entire +specified by ``#dbg_declare`` are valid and unchanging for the entire duration of the function, and are recorded in a simple MachineFunction table. Location changes in the prologue and epilogue of a function are also ignored: frame setup and destruction may take several instructions, require a @@ -668,11 +682,11 @@ otherwise transformed into a non-register, the variable location becomes unavailable. 
Locations that are unavailable are treated as if they have been optimized out: -in IR the location would be assigned ``undef`` by a debug intrinsic, and in MIR +in IR the location would be assigned ``undef`` by a debug record, and in MIR the equivalent location is used. After MIR locations are assigned to each variable, machine pseudo-instructions -corresponding to each ``llvm.dbg.value`` intrinsic are inserted. There are two +corresponding to each ``#dbg_value`` record are inserted. There are two forms of this type of instruction. The first form, ``DBG_VALUE``, appears thus: @@ -684,14 +698,14 @@ The first form, ``DBG_VALUE``, appears thus: And has the following operands: * The first operand can record the variable location as a register, a frame index, an immediate, or the base address register if the original - debug intrinsic referred to memory. ``$noreg`` indicates the variable - location is undefined, equivalent to an ``undef`` dbg.value operand. + debug record referred to memory. ``$noreg`` indicates the variable + location is undefined, equivalent to an ``undef`` #dbg_value operand. * The type of the second operand indicates whether the variable location is directly referred to by the DBG_VALUE, or whether it is indirect. The ``$noreg`` register signifies the former, an immediate operand (0) the latter. - * Operand 3 is the Variable field of the original debug intrinsic. - * Operand 4 is the Expression field of the original debug intrinsic. + * Operand 3 is the Variable field of the original debug record. + * Operand 4 is the Expression field of the original debug record. The second form, ``DBG_VALUE_LIST``, appears thus: @@ -700,8 +714,8 @@ The second form, ``DBG_VALUE_LIST``, appears thus: DBG_VALUE_LIST !123, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), %1, %2 And has the following operands: - * The first operand is the Variable field of the original debug intrinsic. 
- * The second operand is the Expression field of the original debug intrinsic. + * The first operand is the Variable field of the original debug record. + * The second operand is the Expression field of the original debug record. * Any number of operands, from the 3rd onwards, record a sequence of variable location operands, which may take any of the same values as the first operand of the ``DBG_VALUE`` instruction above. These variable location @@ -710,7 +724,7 @@ And has the following operands: `_. The position at which the DBG_VALUEs are inserted should correspond to the -positions of their matching ``llvm.dbg.value`` intrinsics in the IR block. As +positions of their matching ``#dbg_value`` records in the IR block. As with optimization, LLVM aims to preserve the order in which variable assignments occurred in the source program. However SelectionDAG performs some instruction scheduling, which can reorder assignments (discussed below). @@ -724,20 +738,20 @@ the following example: define i32 @foo(i32* %addr) { entry: - call void @llvm.dbg.value(metadata i32 0, metadata !3, metadata !DIExpression()), !dbg !5 + #dbg_value(i32 0, !3, !DIExpression(), !5) br label %bb1, !dbg !5 bb1: ; preds = %bb1, %entry %bar.0 = phi i32 [ 0, %entry ], [ %add, %bb1 ] - call void @llvm.dbg.value(metadata i32 %bar.0, metadata !3, metadata !DIExpression()), !dbg !5 + #dbg_value(i32 %bar.0, !3, !DIExpression(), !5) %addr1 = getelementptr i32, i32 *%addr, i32 1, !dbg !5 - call void @llvm.dbg.value(metadata i32 *%addr1, metadata !3, metadata !DIExpression()), !dbg !5 + #dbg_value(i32 *%addr1, !3, !DIExpression(), !5) %loaded1 = load i32, i32* %addr1, !dbg !5 %addr2 = getelementptr i32, i32 *%addr, i32 %bar.0, !dbg !5 - call void @llvm.dbg.value(metadata i32 *%addr2, metadata !3, metadata !DIExpression()), !dbg !5 + #dbg_value(i32 *%addr2, !3, !DIExpression(), !5) %loaded2 = load i32, i32* %addr2, !dbg !5 %add = add i32 %bar.0, 1, !dbg !5 - call void @llvm.dbg.value(metadata i32 %add, 
metadata !3, metadata !DIExpression()), !dbg !5 + #dbg_value(i32 %add, !3, !DIExpression(), !5) %added = add i32 %loaded1, %loaded2 %cond = icmp ult i32 %added, %bar.0, !dbg !5 br i1 %cond, label %bb1, label %bb2, !dbg !5 @@ -779,12 +793,12 @@ If one compiles this IR with ``llc -o - -start-after=codegen-prepare -stop-after $eax = COPY %8, debug-location !5 RET 0, $eax, debug-location !5 -Observe first that there is a DBG_VALUE instruction for every ``llvm.dbg.value`` -intrinsic in the source IR, ensuring no source level assignments go missing. +Observe first that there is a DBG_VALUE instruction for every ``#dbg_value`` +record in the source IR, ensuring no source level assignments go missing. Then consider the different ways in which variable locations have been recorded: -* For the first dbg.value an immediate operand is used to record a zero value. -* The dbg.value of the PHI instruction leads to a DBG_VALUE of virtual register +* For the first #dbg_value an immediate operand is used to record a zero value. +* The #dbg_value of the PHI instruction leads to a DBG_VALUE of virtual register ``%0``. * The first GEP has its effect folded into the first load instruction (as a 4-byte offset), but the variable location is salvaged by folding @@ -792,7 +806,7 @@ Then consider the different ways in which variable locations have been recorded: * The second GEP is also folded into the corresponding load. However, it is insufficiently simple to be salvaged, and is emitted as a ``$noreg`` DBG_VALUE, indicating that the variable takes on an undefined location. -* The final dbg.value has its Value placed in virtual register ``%1``. +* The final #dbg_value has its Value placed in virtual register ``%1``. 
Instruction Scheduling ---------------------- @@ -899,14 +913,14 @@ presents several difficulties: br label %exit, !dbg !26 truebr: - call void @llvm.dbg.value(metadata i32 %input, metadata !30, metadata !DIExpression()), !dbg !24 - call void @llvm.dbg.value(metadata i32 1, metadata !23, metadata !DIExpression()), !dbg !24 + #dbg_value(i32 %input, !30, !DIExpression(), !24) + #dbg_value(i32 1, !23, !DIExpression(), !24) %value1 = add i32 %input, 1 br label %bb1 falsebr: - call void @llvm.dbg.value(metadata i32 %input, metadata !30, metadata !DIExpression()), !dbg !24 - call void @llvm.dbg.value(metadata i32 2, metadata !23, metadata !DIExpression()), !dbg !24 + #dbg_value(i32 %input, !30, !DIExpression(), !24) + #dbg_value(i32 2, !23, !DIExpression(), !24) %value2 = add i32 %input, 2 br label %bb1 @@ -920,13 +934,13 @@ Here the difficulties are: * The value of the ``!23`` variable merges into ``%bb1``, but there is no PHI node -As mentioned above, the ``llvm.dbg.value`` intrinsics essentially form an -imperative program embedded in the IR, with each intrinsic defining a variable +As mentioned above, the ``#dbg_value`` records essentially form an +imperative program embedded in the IR, with each record defining a variable location. This *could* be converted to an SSA form by mem2reg, in the same way that it uses use-def chains to identify control flow merges and insert phi nodes for IR Values. However, because debug variable locations are defined for every machine instruction, in effect every IR instruction uses every variable -location, which would lead to a large number of debugging intrinsics being +location, which would lead to a large number of debugging records being generated. Examining the example above, variable ``!30`` is assigned ``%input`` on both @@ -935,8 +949,8 @@ constant values on either path. 
Where control flow merges in ``%bb1`` we would want ``!30`` to keep its location (``%input``), but ``!23`` to become undefined as we cannot determine at runtime what value it should have in %bb1 without inserting a PHI node. mem2reg does not insert the PHI node to avoid changing -codegen when debugging is enabled, and does not insert the other dbg.values -to avoid adding very large numbers of intrinsics. +codegen when debugging is enabled, and does not insert the other #dbg_values +to avoid adding very large numbers of records. Instead, LiveDebugValues determines variable locations when control flow merges. A dataflow analysis is used to propagate locations between blocks: diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst index d9f11dd6d77900..c75d8814918aaa 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst @@ -416,7 +416,7 @@ argument allocas in ``FunctionAST::codegen``. Here we're first creating the variable, giving it the scope (``SP``), the name, source location, type, and since it's an argument, the argument -index. Next, we create an ``lvm.dbg.declare`` call to indicate at the IR +index. Next, we create a ``#dbg_declare`` record to indicate at the IR level that we've got a variable in an alloca (and it gives a starting location for the variable), and setting a source location for the beginning of the scope on the declare. From 2b15fb16cee05e6fe56edc6bc24d4d31df48a115 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 11 Jun 2024 15:26:53 +0200 Subject: [PATCH 70/82] [gold] Don't pass StringRef to message() (#95083) This is a printf style variadic function. If using a "%s" format, we should pass "const char *" rather than "StringRef". The use of data() here is safe because we know that the StringRef was originally derived from a null-terminated string. 
--- llvm/tools/gold/gold-plugin.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp index 265ebcbff5877e..0b175a3852e425 100644 --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -307,7 +307,8 @@ namespace options { } else if (opt.consume_front("opt-remarks-hotness-threshold=")) { auto ResultOrErr = remarks::parseHotnessThresholdOption(opt); if (!ResultOrErr) - message(LDPL_FATAL, "Invalid remarks hotness threshold: %s", opt); + message(LDPL_FATAL, "Invalid remarks hotness threshold: %s", + opt.data()); else RemarksHotnessThreshold = *ResultOrErr; } else if (opt.consume_front("opt-remarks-format=")) { @@ -319,7 +320,7 @@ namespace options { } else if (opt.consume_front("time-trace-granularity=")) { unsigned Granularity; if (opt.getAsInteger(10, Granularity)) - message(LDPL_FATAL, "Invalid time trace granularity: %s", opt); + message(LDPL_FATAL, "Invalid time trace granularity: %s", opt.data()); else time_trace_granularity = Granularity; } else { From 837dc542b1519df343e5a8f7b2718483530a4193 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Tue, 11 Jun 2024 21:27:14 +0800 Subject: [PATCH 71/82] [CodeGen][NewPM] Split `MachineDominatorTree` into a concrete analysis result (#94571) Prepare for new pass manager version of `MachineDominatorTreeAnalysis`. We may need a machine dominator tree version of `DomTreeUpdater` to handle `SplitCriticalEdge` in some CodeGen passes. 
--- llvm/include/llvm/CodeGen/MachineDominators.h | 124 ++++++++++++------ .../llvm/CodeGen/MachineUniformityAnalysis.h | 2 +- llvm/include/llvm/InitializePasses.h | 2 +- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/EarlyIfConversion.cpp | 16 +-- llvm/lib/CodeGen/InlineSpiller.cpp | 16 +-- .../CodeGen/LazyMachineBlockFrequencyInfo.cpp | 3 +- llvm/lib/CodeGen/LiveDebugVariables.cpp | 4 +- llvm/lib/CodeGen/LiveIntervals.cpp | 4 +- llvm/lib/CodeGen/MIRSampleProfile.cpp | 6 +- llvm/lib/CodeGen/MachineBasicBlock.cpp | 6 +- llvm/lib/CodeGen/MachineCSE.cpp | 8 +- llvm/lib/CodeGen/MachineCombiner.cpp | 2 +- llvm/lib/CodeGen/MachineDominanceFrontier.cpp | 7 +- llvm/lib/CodeGen/MachineDominators.cpp | 73 ++++++----- llvm/lib/CodeGen/MachineLICM.cpp | 8 +- llvm/lib/CodeGen/MachineLoopInfo.cpp | 6 +- llvm/lib/CodeGen/MachinePipeliner.cpp | 6 +- llvm/lib/CodeGen/MachineRegionInfo.cpp | 6 +- llvm/lib/CodeGen/MachineScheduler.cpp | 10 +- llvm/lib/CodeGen/MachineSink.cpp | 6 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 9 +- llvm/lib/CodeGen/MachineVerifier.cpp | 4 +- llvm/lib/CodeGen/PHIElimination.cpp | 6 +- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 9 +- llvm/lib/CodeGen/PostRASchedulerList.cpp | 4 +- llvm/lib/CodeGen/PrologEpilogInserter.cpp | 4 +- llvm/lib/CodeGen/RegAllocBasic.cpp | 2 +- llvm/lib/CodeGen/RegAllocGreedy.cpp | 8 +- llvm/lib/CodeGen/RegAllocPBQP.cpp | 4 +- llvm/lib/CodeGen/ShrinkWrap.cpp | 8 +- llvm/lib/CodeGen/UnreachableBlockElim.cpp | 6 +- llvm/lib/CodeGen/XRayInstrumentation.cpp | 6 +- .../AArch64CleanupLocalDynamicTLSPass.cpp | 5 +- .../AArch64/AArch64ConditionOptimizer.cpp | 8 +- .../AArch64/AArch64ConditionalCompares.cpp | 8 +- .../GISel/AArch64PostLegalizerCombiner.cpp | 7 +- .../GISel/AArch64PreLegalizerCombiner.cpp | 7 +- .../AMDGPUGlobalISelDivergenceLowering.cpp | 7 +- .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 7 +- .../AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 7 +- 
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 7 +- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 7 +- .../AMDGPU/R600MachineCFGStructurizer.cpp | 8 +- .../AMDGPU/R600OptimizeVectorRegisters.cpp | 4 +- llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 4 +- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 9 +- .../Target/AMDGPU/SILateBranchLowering.cpp | 8 +- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 5 +- llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 9 +- .../Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 8 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 +- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 7 +- llvm/lib/Target/ARC/ARCBranchFinalize.cpp | 2 +- llvm/lib/Target/ARC/ARCOptAddrMode.cpp | 8 +- llvm/lib/Target/ARM/ARMConstantIslandPass.cpp | 4 +- llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 8 +- .../ARM/MVETPAndVPTOptimisationsPass.cpp | 9 +- .../Target/CSKY/CSKYConstantIslandPass.cpp | 2 +- .../lib/Target/Hexagon/HexagonBitSimplify.cpp | 8 +- .../Target/Hexagon/HexagonConstExtenders.cpp | 8 +- .../Target/Hexagon/HexagonCopyHoisting.cpp | 4 +- .../lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 6 +- .../Target/Hexagon/HexagonExpandCondsets.cpp | 8 +- .../Target/Hexagon/HexagonFrameLowering.cpp | 2 +- llvm/lib/Target/Hexagon/HexagonGenInsert.cpp | 8 +- .../Target/Hexagon/HexagonGenMemAbsolute.cpp | 7 +- .../Target/Hexagon/HexagonGenPredicate.cpp | 6 +- .../Target/Hexagon/HexagonHardwareLoops.cpp | 6 +- .../lib/Target/Hexagon/HexagonOptAddrMode.cpp | 6 +- llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp | 6 +- .../Target/Hexagon/HexagonVLIWPacketizer.cpp | 6 +- llvm/lib/Target/Mips/MipsOptimizePICCall.cpp | 5 +- .../Target/Mips/MipsPostLegalizerCombiner.cpp | 7 +- .../Target/PowerPC/PPCBranchCoalescing.cpp | 6 +- llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp | 6 +- llvm/lib/Target/PowerPC/PPCMIPeephole.cpp | 8 +- .../Target/PowerPC/PPCReduceCRLogicals.cpp | 4 +- llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 6 +- .../GISel/RISCVPostLegalizerCombiner.cpp | 7 +- 
.../RISCV/GISel/RISCVPreLegalizerCombiner.cpp | 7 +- llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp | 5 +- .../Target/WebAssembly/WebAssemblyCFGSort.cpp | 6 +- .../WebAssembly/WebAssemblyCFGStackify.cpp | 6 +- .../WebAssembly/WebAssemblyExceptionInfo.cpp | 6 +- .../WebAssemblyMemIntrinsicResults.cpp | 6 +- .../WebAssembly/WebAssemblyRegStackify.cpp | 6 +- llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 4 +- llvm/lib/Target/X86/X86InstrInfo.cpp | 5 +- .../X86LoadValueInjectionLoadHardening.cpp | 6 +- .../deltas/ReduceInstructionsMIR.cpp | 2 +- .../WebAssemblyExceptionInfoTest.cpp | 4 +- 93 files changed, 414 insertions(+), 336 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h index 30c18ef410fab3..7c7df8ca4c3810 100644 --- a/llvm/include/llvm/CodeGen/MachineDominators.h +++ b/llvm/include/llvm/CodeGen/MachineDominators.h @@ -24,6 +24,7 @@ #include "llvm/Support/GenericDomTreeConstruction.h" #include #include +#include namespace llvm { class AnalysisUsage; @@ -39,16 +40,39 @@ inline void DominatorTreeBase::addRoot( extern template class DomTreeNodeBase; extern template class DominatorTreeBase; // DomTree -extern template class DominatorTreeBase; // PostDomTree -using MachineDomTree = DomTreeBase; using MachineDomTreeNode = DomTreeNodeBase; +namespace DomTreeBuilder { +using MBBDomTree = DomTreeBase; +using MBBUpdates = ArrayRef>; +using MBBDomTreeGraphDiff = GraphDiff; + +extern template void Calculate(MBBDomTree &DT); +extern template void CalculateWithUpdates(MBBDomTree &DT, + MBBUpdates U); + +extern template void InsertEdge(MBBDomTree &DT, + MachineBasicBlock *From, + MachineBasicBlock *To); + +extern template void DeleteEdge(MBBDomTree &DT, + MachineBasicBlock *From, + MachineBasicBlock *To); + +extern template void ApplyUpdates(MBBDomTree &DT, + MBBDomTreeGraphDiff &, + MBBDomTreeGraphDiff *); + +extern template bool Verify(const MBBDomTree &DT, + MBBDomTree::VerificationLevel VL); +} // 
namespace DomTreeBuilder + //===------------------------------------- /// DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to /// compute a normal dominator tree. /// -class MachineDominatorTree : public MachineFunctionPass { +class MachineDominatorTree : public DomTreeBase { /// Helper structure used to hold all the basic blocks /// involved in the split of a critical edge. struct CriticalEdge { @@ -70,62 +94,55 @@ class MachineDominatorTree : public MachineFunctionPass { /// such as BB == elt.NewBB. mutable SmallSet NewBBs; - /// The DominatorTreeBase that is used to compute a normal dominator tree. - std::unique_ptr DT; - /// Apply all the recorded critical edges to the DT. /// This updates the underlying DT information in a way that uses /// the fast query path of DT as much as possible. + /// FIXME: This method should not be a const member! /// /// \post CriticalEdgesToSplit.empty(). void applySplitCriticalEdges() const; public: - static char ID; // Pass ID, replacement for typeid + using Base = DomTreeBase; - MachineDominatorTree(); - explicit MachineDominatorTree(MachineFunction &MF) : MachineFunctionPass(ID) { - calculate(MF); - } + MachineDominatorTree() = default; + explicit MachineDominatorTree(MachineFunction &MF) { calculate(MF); } - MachineDomTree &getBase() { - if (!DT) - DT.reset(new MachineDomTree()); + // FIXME: If there is an updater for MachineDominatorTree, + // migrate to this updater and remove these wrappers. 
+ + MachineDominatorTree &getBase() { applySplitCriticalEdges(); - return *DT; + return *this; } - void getAnalysisUsage(AnalysisUsage &AU) const override; - MachineBasicBlock *getRoot() const { applySplitCriticalEdges(); - return DT->getRoot(); + return Base::getRoot(); } MachineDomTreeNode *getRootNode() const { applySplitCriticalEdges(); - return DT->getRootNode(); + return const_cast(Base::getRootNode()); } - bool runOnMachineFunction(MachineFunction &F) override; - void calculate(MachineFunction &F); bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const { applySplitCriticalEdges(); - return DT->dominates(A, B); + return Base::dominates(A, B); } void getDescendants(MachineBasicBlock *A, SmallVectorImpl &Result) { applySplitCriticalEdges(); - DT->getDescendants(A, Result); + Base::getDescendants(A, Result); } bool dominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const { applySplitCriticalEdges(); - return DT->dominates(A, B); + return Base::dominates(A, B); } // dominates - Return true if A dominates B. This performs the @@ -133,7 +150,8 @@ class MachineDominatorTree : public MachineFunctionPass { bool dominates(const MachineInstr *A, const MachineInstr *B) const { applySplitCriticalEdges(); const MachineBasicBlock *BBA = A->getParent(), *BBB = B->getParent(); - if (BBA != BBB) return DT->dominates(BBA, BBB); + if (BBA != BBB) + return Base::dominates(BBA, BBB); // Loop through the basic block until we find A or B. 
MachineBasicBlock::const_iterator I = BBA->begin(); @@ -146,13 +164,13 @@ class MachineDominatorTree : public MachineFunctionPass { bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const { applySplitCriticalEdges(); - return DT->properlyDominates(A, B); + return Base::properlyDominates(A, B); } bool properlyDominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const { applySplitCriticalEdges(); - return DT->properlyDominates(A, B); + return Base::properlyDominates(A, B); } /// findNearestCommonDominator - Find nearest common dominator basic block @@ -160,12 +178,12 @@ class MachineDominatorTree : public MachineFunctionPass { MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A, MachineBasicBlock *B) { applySplitCriticalEdges(); - return DT->findNearestCommonDominator(A, B); + return Base::findNearestCommonDominator(A, B); } MachineDomTreeNode *operator[](MachineBasicBlock *BB) const { applySplitCriticalEdges(); - return DT->getNode(BB); + return Base::getNode(BB); } /// getNode - return the (Post)DominatorTree node for the specified basic @@ -173,7 +191,7 @@ class MachineDominatorTree : public MachineFunctionPass { /// MachineDomTreeNode *getNode(MachineBasicBlock *BB) const { applySplitCriticalEdges(); - return DT->getNode(BB); + return Base::getNode(BB); } /// addNewBlock - Add a new node to the dominator tree information. 
This @@ -182,7 +200,7 @@ class MachineDominatorTree : public MachineFunctionPass { MachineDomTreeNode *addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB) { applySplitCriticalEdges(); - return DT->addNewBlock(BB, DomBB); + return Base::addNewBlock(BB, DomBB); } /// changeImmediateDominator - This method is used to update the dominator @@ -191,13 +209,13 @@ class MachineDominatorTree : public MachineFunctionPass { void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom) { applySplitCriticalEdges(); - DT->changeImmediateDominator(N, NewIDom); + Base::changeImmediateDominator(N, NewIDom); } void changeImmediateDominator(MachineDomTreeNode *N, MachineDomTreeNode *NewIDom) { applySplitCriticalEdges(); - DT->changeImmediateDominator(N, NewIDom); + Base::changeImmediateDominator(N, NewIDom); } /// eraseNode - Removes a node from the dominator tree. Block must not @@ -205,29 +223,23 @@ class MachineDominatorTree : public MachineFunctionPass { /// children list. Deletes dominator node associated with basic block BB. void eraseNode(MachineBasicBlock *BB) { applySplitCriticalEdges(); - DT->eraseNode(BB); + Base::eraseNode(BB); } /// splitBlock - BB is split and now it has one successor. Update dominator /// tree to reflect this change. void splitBlock(MachineBasicBlock* NewBB) { applySplitCriticalEdges(); - DT->splitBlock(NewBB); + Base::splitBlock(NewBB); } /// isReachableFromEntry - Return true if A is dominated by the entry /// block of the function containing it. bool isReachableFromEntry(const MachineBasicBlock *A) { applySplitCriticalEdges(); - return DT->isReachableFromEntry(A); + return Base::isReachableFromEntry(A); } - void releaseMemory() override; - - void verifyAnalysis() const override; - - void print(raw_ostream &OS, const Module*) const override; - /// Record that the critical edge (FromBB, ToBB) has been /// split with NewBB. 
/// This is best to use this method instead of directly update the @@ -251,6 +263,34 @@ class MachineDominatorTree : public MachineFunctionPass { } }; +/// \brief Analysis pass which computes a \c MachineDominatorTree. +class MachineDominatorTreeWrapperPass : public MachineFunctionPass { + // MachineFunctionPass may verify the analysis result without running pass, + // e.g. when `F.hasAvailableExternallyLinkage` is true. + std::optional DT; + +public: + static char ID; + + MachineDominatorTreeWrapperPass(); + + MachineDominatorTree &getDomTree() { return *DT; } + const MachineDominatorTree &getDomTree() const { return *DT; } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void verifyAnalysis() const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + void releaseMemory() override; + + void print(raw_ostream &OS, const Module *M = nullptr) const override; +}; + //===------------------------------------- /// DominatorTree GraphTraits specialization so the DominatorTree can be /// iterable by generic graph iterators. diff --git a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h index 1039ac4e5189b3..a9b5eaf41c3f8d 100644 --- a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h +++ b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h @@ -30,7 +30,7 @@ using MachineUniformityInfo = GenericUniformityInfo; /// everything is uniform. MachineUniformityInfo computeMachineUniformityInfo( MachineFunction &F, const MachineCycleInfo &cycleInfo, - const MachineDomTree &domTree, bool HasBranchDivergence); + const MachineDominatorTree &domTree, bool HasBranchDivergence); /// Legacy analysis pass which computes a \ref MachineUniformityInfo. 
class MachineUniformityAnalysisPass : public MachineFunctionPass { diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 8803ef5a90e6e4..ee13735ef3257c 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -190,7 +190,7 @@ void initializeMachineCopyPropagationPass(PassRegistry&); void initializeMachineCycleInfoPrinterPassPass(PassRegistry &); void initializeMachineCycleInfoWrapperPassPass(PassRegistry &); void initializeMachineDominanceFrontierPass(PassRegistry&); -void initializeMachineDominatorTreePass(PassRegistry&); +void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &); void initializeMachineFunctionPrinterPassPass(PassRegistry&); void initializeMachineFunctionSplitterPass(PassRegistry &); void initializeMachineLateInstrsCleanupPass(PassRegistry&); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 2943b270cd5df1..3580d484b7ddd6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1728,7 +1728,8 @@ void AsmPrinter::emitFunctionBody() { if (isVerbose()) { // Get MachineDominatorTree or compute it on the fly if it's unavailable - MDT = getAnalysisIfAvailable(); + auto MDTWrapper = getAnalysisIfAvailable(); + MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; if (!MDT) { OwnedMDT = std::make_unique(); OwnedMDT->getBase().recalculate(*MF); diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 544f1b7f593531..b9093208aad588 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -80,7 +80,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineCopyPropagationPass(Registry); initializeMachineCycleInfoPrinterPassPass(Registry); initializeMachineCycleInfoWrapperPassPass(Registry); - initializeMachineDominatorTreePass(Registry); + initializeMachineDominatorTreeWrapperPassPass(Registry); initializeMachineFunctionPrinterPassPass(Registry); initializeMachineLateInstrsCleanupPass(Registry); initializeMachineLICMPass(Registry); diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 2a7bee1618deb2..30480e598acef8 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -790,15 +790,15 @@ char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, "Early If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE, "Early If Converter", false, false) void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -1089,7 +1089,7 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { TRI = STI.getRegisterInfo(); SchedModel = STI.getSchedModel(); MRI = &MF.getRegInfo(); - DomTree = &getAnalysis(); + DomTree = &getAnalysis().getDomTree(); Loops = &getAnalysis(); Traces = &getAnalysis(); MinInstr = 
nullptr; @@ -1144,15 +1144,15 @@ char &llvm::EarlyIfPredicatorID = EarlyIfPredicator::ID; INITIALIZE_PASS_BEGIN(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_END(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator", false, false) void EarlyIfPredicator::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -1223,7 +1223,7 @@ bool EarlyIfPredicator::runOnMachineFunction(MachineFunction &MF) { TRI = STI.getRegisterInfo(); MRI = &MF.getRegInfo(); SchedModel.init(&STI); - DomTree = &getAnalysis(); + DomTree = &getAnalysis().getDomTree(); Loops = &getAnalysis(); MBPI = &getAnalysis(); diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 69c671220db353..98a0150a84e315 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -135,8 +135,8 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate { VirtRegMap &vrm) : MF(mf), LIS(pass.getAnalysis()), LSS(pass.getAnalysis()), - MDT(pass.getAnalysis()), VRM(vrm), - MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), + MDT(pass.getAnalysis().getDomTree()), + VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()), TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(pass.getAnalysis()), IPA(LIS, mf.getNumBlockIDs()) {} @@ -192,8 +192,8 @@ class InlineSpiller : public Spiller { VirtRegAuxInfo &VRAI) : MF(MF), LIS(Pass.getAnalysis()), LSS(Pass.getAnalysis()), - MDT(Pass.getAnalysis()), VRM(VRM), - MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()), + MDT(Pass.getAnalysis().getDomTree()), + VRM(VRM), MRI(MF.getRegInfo()), 
TII(*MF.getSubtarget().getInstrInfo()), TRI(*MF.getSubtarget().getRegisterInfo()), MBFI(Pass.getAnalysis()), HSpiller(Pass, MF, VRM), VRAI(VRAI) {} @@ -1381,7 +1381,7 @@ void HoistSpillHelper::rmRedundantSpills( // earlier spill with smaller SlotIndex. for (auto *const CurrentSpill : Spills) { MachineBasicBlock *Block = CurrentSpill->getParent(); - MachineDomTreeNode *Node = MDT.getBase().getNode(Block); + MachineDomTreeNode *Node = MDT.getNode(Block); MachineInstr *PrevSpill = SpillBBToSpill[Node]; if (PrevSpill) { SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill); @@ -1389,9 +1389,9 @@ void HoistSpillHelper::rmRedundantSpills( MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill; MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill; SpillsToRm.push_back(SpillToRm); - SpillBBToSpill[MDT.getBase().getNode(Block)] = SpillToKeep; + SpillBBToSpill[MDT.getNode(Block)] = SpillToKeep; } else { - SpillBBToSpill[MDT.getBase().getNode(Block)] = CurrentSpill; + SpillBBToSpill[MDT.getNode(Block)] = CurrentSpill; } } for (auto *const SpillToRm : SpillsToRm) @@ -1465,7 +1465,7 @@ void HoistSpillHelper::getVisitOrders( // Sort the nodes in WorkSet in top-down order and save the nodes // in Orders. Orders will be used for hoisting in runHoistSpills. 
unsigned idx = 0; - Orders.push_back(MDT.getBase().getNode(Root)); + Orders.push_back(MDT.getNode(Root)); do { MachineDomTreeNode *Node = Orders[idx++]; for (MachineDomTreeNode *Child : Node->children()) { diff --git a/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp index 39b44b917d9e37..721b75900c8ef0 100644 --- a/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp +++ b/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp @@ -64,7 +64,8 @@ LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const { auto &MBPI = getAnalysis(); auto *MLI = getAnalysisIfAvailable(); - auto *MDT = getAnalysisIfAvailable(); + auto *MDTWrapper = getAnalysisIfAvailable(); + auto *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; LLVM_DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n"); LLVM_DEBUG(if (MLI) dbgs() << "LoopInfo is available\n"); diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 3a59ae7ab06644..16d8e916ce6682 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -78,13 +78,13 @@ char LiveDebugVariables::ID = 0; INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); + AU.addRequired(); AU.addRequiredTransitive(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 42c769399a1401..f9162b444e03d8 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -61,7 +61,7 @@ char LiveIntervals::ID = 0; 
char &llvm::LiveIntervalsID = LiveIntervals::ID; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_END(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) @@ -123,7 +123,7 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { TRI = MF->getSubtarget().getRegisterInfo(); TII = MF->getSubtarget().getInstrInfo(); Indexes = &getAnalysis(); - DomTree = &getAnalysis(); + DomTree = &getAnalysis().getDomTree(); if (!LICalc) LICalc = new LiveIntervalCalc(); diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp index 6faa1ad1a7790e..138cc567487626 100644 --- a/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -70,7 +70,7 @@ INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE, "Load MIR Sample Profile", /* cfg = */ false, /* is_analysis = */ false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) @@ -365,7 +365,7 @@ bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) { << MF.getFunction().getName() << "\n"); MBFI = &getAnalysis(); MIRSampleLoader->setInitVals( - &getAnalysis(), + &getAnalysis().getDomTree(), &getAnalysis(), &getAnalysis(), MBFI, &getAnalysis().getORE()); @@ -400,7 +400,7 @@ bool MIRProfileLoaderPass::doInitialization(Module &M) { void MIRProfileLoaderPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequiredTransitive(); AU.addRequired(); diff --git 
a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 0bd5f09564ec0c..16505f21f0aadc 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -1330,9 +1330,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( LIS->repairIntervalsInRange(this, getFirstTerminator(), end(), UsedRegs); } - if (MachineDominatorTree *MDT = - P.getAnalysisIfAvailable()) - MDT->recordSplitCriticalEdge(this, Succ, NMBB); + if (auto *MDTWrapper = + P.getAnalysisIfAvailable()) + MDTWrapper->getDomTree().recordSplitCriticalEdge(this, Succ, NMBB); if (MachineLoopInfo *MLI = P.getAnalysisIfAvailable()) if (MachineLoop *TIL = MLI->getLoopFor(this)) { diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 42cdcaa5bbf4f2..4e6101f8755897 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -92,8 +92,8 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addPreservedID(MachineLoopInfoID); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); } @@ -166,7 +166,7 @@ char &llvm::MachineCSEID = MachineCSE::ID; INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, "Machine Common Subexpression Elimination", false, false) @@ -943,7 +943,7 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); AA = &getAnalysis().getAAResults(); - DT = &getAnalysis(); + DT = &getAnalysis().getDomTree(); MBFI = &getAnalysis(); LookAheadLimit = TII->getMachineCSELookAheadLimit(); bool ChangedPRE, ChangedCSE; diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp 
b/llvm/lib/CodeGen/MachineCombiner.cpp index c11263163a34ff..3bd3b8a386b419 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -139,7 +139,7 @@ INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addPreserved(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); diff --git a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp index 346cfedde390d3..6a8ede4feb9378 100644 --- a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp +++ b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp @@ -26,7 +26,7 @@ char MachineDominanceFrontier::ID = 0; INITIALIZE_PASS_BEGIN(MachineDominanceFrontier, "machine-domfrontier", "Machine Dominance Frontier Construction", true, true) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier", "Machine Dominance Frontier Construction", true, true) @@ -38,7 +38,8 @@ char &llvm::MachineDominanceFrontierID = MachineDominanceFrontier::ID; bool MachineDominanceFrontier::runOnMachineFunction(MachineFunction &) { releaseMemory(); - Base.analyze(getAnalysis().getBase()); + Base.analyze( + getAnalysis().getDomTree().getBase()); return false; } @@ -48,6 +49,6 @@ void MachineDominanceFrontier::releaseMemory() { void MachineDominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/MachineDominators.cpp b/llvm/lib/CodeGen/MachineDominators.cpp index 0632cde9c6f4e4..bdde8a4b435700 100644 --- a/llvm/lib/CodeGen/MachineDominators.cpp +++ b/llvm/lib/CodeGen/MachineDominators.cpp @@ -37,51 +37,59 @@ static cl::opt VerifyMachineDomInfoX( namespace llvm { template class DomTreeNodeBase; 
template class DominatorTreeBase; // DomTreeBase -} -char MachineDominatorTree::ID = 0; +namespace DomTreeBuilder { +template void Calculate(MBBDomTree &DT); +template void CalculateWithUpdates(MBBDomTree &DT, MBBUpdates U); -INITIALIZE_PASS(MachineDominatorTree, "machinedomtree", - "MachineDominator Tree Construction", true, true) +template void InsertEdge(MBBDomTree &DT, MachineBasicBlock *From, + MachineBasicBlock *To); -char &llvm::MachineDominatorsID = MachineDominatorTree::ID; +template void DeleteEdge(MBBDomTree &DT, MachineBasicBlock *From, + MachineBasicBlock *To); -void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); +template void ApplyUpdates(MBBDomTree &DT, MBBDomTreeGraphDiff &, + MBBDomTreeGraphDiff *); + +template bool Verify(const MBBDomTree &DT, + MBBDomTree::VerificationLevel VL); +} // namespace DomTreeBuilder } -bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) { - calculate(F); - return false; +char MachineDominatorTreeWrapperPass::ID = 0; + +INITIALIZE_PASS(MachineDominatorTreeWrapperPass, "machinedomtree", + "MachineDominator Tree Construction", true, true) + +MachineDominatorTreeWrapperPass::MachineDominatorTreeWrapperPass() + : MachineFunctionPass(ID) { + initializeMachineDominatorTreeWrapperPassPass( + *PassRegistry::getPassRegistry()); } void MachineDominatorTree::calculate(MachineFunction &F) { CriticalEdgesToSplit.clear(); NewBBs.clear(); - DT.reset(new DomTreeBase()); - DT->recalculate(F); + recalculate(F); } -MachineDominatorTree::MachineDominatorTree() - : MachineFunctionPass(ID) { - initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); -} +char &llvm::MachineDominatorsID = MachineDominatorTreeWrapperPass::ID; -void MachineDominatorTree::releaseMemory() { - CriticalEdgesToSplit.clear(); - DT.reset(nullptr); +bool MachineDominatorTreeWrapperPass::runOnMachineFunction(MachineFunction &F) { + DT = 
MachineDominatorTree(F); + return false; } -void MachineDominatorTree::verifyAnalysis() const { - if (DT && VerifyMachineDomInfo) - if (!DT->verify(MachineDomTree::VerificationLevel::Basic)) { - errs() << "MachineDominatorTree verification failed\n"; - abort(); - } +void MachineDominatorTreeWrapperPass::releaseMemory() { DT.reset(); } + +void MachineDominatorTreeWrapperPass::verifyAnalysis() const { + if (VerifyMachineDomInfo && DT) + if (!DT->verify(MachineDominatorTree::VerificationLevel::Basic)) + report_fatal_error("MachineDominatorTree verification failed!"); } -void MachineDominatorTree::print(raw_ostream &OS, const Module*) const { +void MachineDominatorTreeWrapperPass::print(raw_ostream &OS, + const Module *) const { if (DT) DT->print(OS); } @@ -103,7 +111,7 @@ void MachineDominatorTree::applySplitCriticalEdges() const { for (CriticalEdge &Edge : CriticalEdgesToSplit) { // Update dominator information. MachineBasicBlock *Succ = Edge.ToBB; - MachineDomTreeNode *SuccDTNode = DT->getNode(Succ); + MachineDomTreeNode *SuccDTNode = Base::getNode(Succ); for (MachineBasicBlock *PredBB : Succ->predecessors()) { if (PredBB == Edge.NewBB) @@ -126,7 +134,7 @@ void MachineDominatorTree::applySplitCriticalEdges() const { "than one predecessor!"); PredBB = *PredBB->pred_begin(); } - if (!DT->dominates(SuccDTNode, DT->getNode(PredBB))) { + if (!Base::dominates(SuccDTNode, Base::getNode(PredBB))) { IsNewIDom[Idx] = false; break; } @@ -138,13 +146,16 @@ void MachineDominatorTree::applySplitCriticalEdges() const { Idx = 0; for (CriticalEdge &Edge : CriticalEdgesToSplit) { // We know FromBB dominates NewBB. - MachineDomTreeNode *NewDTNode = DT->addNewBlock(Edge.NewBB, Edge.FromBB); + MachineDomTreeNode *NewDTNode = + const_cast(this)->Base::addNewBlock( + Edge.NewBB, Edge.FromBB); // If all the other predecessors of "Succ" are dominated by "Succ" itself // then the new block is the new immediate dominator of "Succ". Otherwise, // the new block doesn't dominate anything. 
if (IsNewIDom[Idx]) - DT->changeImmediateDominator(DT->getNode(Edge.ToBB), NewDTNode); + const_cast(this)->Base::changeImmediateDominator( + Base::getNode(Edge.ToBB), NewDTNode); ++Idx; } NewBBs.clear(); diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 9cc6d9b9fa715f..edf8988512c78d 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -191,7 +191,7 @@ namespace { AU.addRequired(); if (DisableHoistingToHotterBlocks != UseBFI::None) AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -325,7 +325,7 @@ INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE, "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE, "Machine Loop Invariant Code Motion", false, false) @@ -334,7 +334,7 @@ INITIALIZE_PASS_BEGIN(EarlyMachineLICM, "early-machinelicm", "Early Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(EarlyMachineLICM, "early-machinelicm", "Early Machine Loop Invariant Code Motion", false, false) @@ -375,7 +375,7 @@ bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) { if (DisableHoistingToHotterBlocks != UseBFI::None) MBFI = &getAnalysis(); MLI = &getAnalysis(); - DT = &getAnalysis(); + DT = &getAnalysis().getDomTree(); AA = &getAnalysis().getAAResults(); if (HoistConstLoads) diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp 
b/llvm/lib/CodeGen/MachineLoopInfo.cpp index 1019c53e57c6fb..9fb103945838ad 100644 --- a/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -36,14 +36,14 @@ MachineLoopInfo::MachineLoopInfo() : MachineFunctionPass(ID) { } INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops", "Machine Natural Loop Construction", true, true) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(MachineLoopInfo, "machine-loops", "Machine Natural Loop Construction", true, true) char &llvm::MachineLoopInfoID = MachineLoopInfo::ID; bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) { - calculate(getAnalysis()); + calculate(getAnalysis().getDomTree()); return false; } @@ -54,7 +54,7 @@ void MachineLoopInfo::calculate(MachineDominatorTree &MDT) { void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 4f7d9d070cee6f..32f65f0d491398 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -219,7 +219,7 @@ INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) @@ -248,7 +248,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MLI = &getAnalysis(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); ORE = &getAnalysis().getORE(); TII = MF->getSubtarget().getInstrInfo(); RegClassInfo.runOnMachineFunction(*MF); 
@@ -481,7 +481,7 @@ void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/CodeGen/MachineRegionInfo.cpp b/llvm/lib/CodeGen/MachineRegionInfo.cpp index 45cdcbfeab9f12..d496b0c182c76e 100644 --- a/llvm/lib/CodeGen/MachineRegionInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegionInfo.cpp @@ -84,7 +84,7 @@ MachineRegionInfoPass::~MachineRegionInfoPass() = default; bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) { releaseMemory(); - auto DT = &getAnalysis(); + auto DT = &getAnalysis().getDomTree(); auto PDT = &getAnalysis(); auto DF = &getAnalysis(); @@ -109,7 +109,7 @@ void MachineRegionInfoPass::verifyAnalysis() const { void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -130,7 +130,7 @@ char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, "Detect single entry single exit regions", true, true) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE, diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 03e892a5e0d225..cf72f743808350 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -266,7 +266,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID; INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) @@ -279,7 +279,7 @@ MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) { void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -296,7 +296,7 @@ char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID; INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched", "PostRA Machine Instruction Scheduler", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(PostMachineScheduler, "postmisched", @@ -308,7 +308,7 @@ PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) { void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -445,7 +445,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Initialize the context of the pass. 
MF = &mf; MLI = &getAnalysis(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); PassConfig = &getAnalysis(); AA = &getAnalysis().getAAResults(); diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 3d40130b92c443..dcfa389e9bf416 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -184,7 +184,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -274,7 +274,7 @@ char &llvm::MachineSinkingID = MachineSinking::ID; INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE, "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE, @@ -708,7 +708,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) { TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); MRI = &MF.getRegInfo(); - DT = &getAnalysis(); + DT = &getAnalysis().getDomTree(); PDT = &getAnalysis(); CI = &getAnalysis().getCycleInfo(); MBFI = UseBlockFreqInfo ? 
&getAnalysis() : nullptr; diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp index 131138e0649e4c..7548fc8141ec56 100644 --- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp @@ -155,7 +155,7 @@ template struct llvm::GenericUniformityAnalysisImplDeleter< MachineUniformityInfo llvm::computeMachineUniformityInfo( MachineFunction &F, const MachineCycleInfo &cycleInfo, - const MachineDomTree &domTree, bool HasBranchDivergence) { + const MachineDominatorTree &domTree, bool HasBranchDivergence) { assert(F.getRegInfo().isSSA() && "Expected to be run on SSA form!"); MachineUniformityInfo UI(domTree, cycleInfo); if (HasBranchDivergence) @@ -187,19 +187,20 @@ MachineUniformityAnalysisPass::MachineUniformityAnalysisPass() INITIALIZE_PASS_BEGIN(MachineUniformityAnalysisPass, "machine-uniformity", "Machine Uniformity Info Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity", "Machine Uniformity Info Analysis", true, true) void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } bool MachineUniformityAnalysisPass::runOnMachineFunction(MachineFunction &MF) { - auto &DomTree = getAnalysis().getBase(); + auto &DomTree = + getAnalysis().getDomTree().getBase(); auto &CI = getAnalysis().getCycleInfo(); // FIXME: Query TTI::hasBranchDivergence. 
-run-pass seems to end up with a // default NoTTI diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 0744089486313d..9ea238c61ed91e 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -226,7 +226,7 @@ namespace { // This is calculated only when trying to verify convergence control tokens. // Similar to the LLVM IR verifier, we calculate this locally instead of // relying on the pass manager. - MachineDomTree DT; + MachineDominatorTree DT; void visitMachineFunctionBefore(); void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); @@ -3177,7 +3177,7 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { } static void -verifyConvergenceControl(const MachineFunction &MF, MachineDomTree &DT, +verifyConvergenceControl(const MachineFunction &MF, MachineDominatorTree &DT, std::function FailureCB) { MachineConvergenceVerifier CV; CV.initialize(&errs(), FailureCB, MF); diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index 3254ec0b77fe78..592972f5c83b2b 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -139,7 +139,7 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -216,8 +216,8 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { // TODO: we should use the incremental DomTree updater here. 
if (Changed) - if (auto *MDT = getAnalysisIfAvailable()) - MDT->getBase().recalculate(MF); + if (auto *MDT = getAnalysisIfAvailable()) + MDT->getDomTree().getBase().recalculate(MF); LoweredPHIs.clear(); ImpDefs.clear(); diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 477a86dbe3f8c4..e6fe7a070f2a50 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -172,8 +172,8 @@ namespace { AU.addRequired(); AU.addPreserved(); if (Aggressive) { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } } @@ -487,7 +487,7 @@ char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID; INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE, "Peephole Optimizations", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE, "Peephole Optimizations", false, false) @@ -1670,7 +1670,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - DT = Aggressive ? &getAnalysis() : nullptr; + DT = Aggressive ? 
&getAnalysis().getDomTree() + : nullptr; MLI = &getAnalysis(); MF.setDelegate(this); diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index ffd70a29f17150..8005050d5215ac 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -85,8 +85,8 @@ namespace { AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 6a72797de493d4..ca54e88177e988 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -151,7 +151,7 @@ char &llvm::PrologEpilogCodeInserterID = PEI::ID; INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) INITIALIZE_PASS_END(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion & Frame Finalization", false, @@ -167,7 +167,7 @@ STATISTIC(NumBytesStackSpace, void PEI::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index 5bd3b126aa1666..181337ca4d60f0 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -136,7 +136,7 @@ INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) INITIALIZE_PASS_DEPENDENCY(MachineScheduler) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 348277224c7aee..500ceb3d8b7006 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -160,7 +160,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer) INITIALIZE_PASS_DEPENDENCY(MachineScheduler) INITIALIZE_PASS_DEPENDENCY(LiveStacks) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) @@ -213,8 +213,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -2729,7 +2729,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { // SlotIndexes::getApproxInstrDistance. 
Indexes->packIndexes(); MBFI = &getAnalysis(); - DomTree = &getAnalysis(); + DomTree = &getAnalysis().getDomTree(); ORE = &getAnalysis().getORE(); Loops = &getAnalysis(); Bundles = &getAnalysis(); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index aea92788057971..88ba843067e5aa 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -557,8 +557,8 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const { au.addPreserved(); au.addRequired(); au.addPreserved(); - au.addRequired(); - au.addPreserved(); + au.addRequired(); + au.addPreserved(); au.addRequired(); au.addPreserved(); MachineFunctionPass::getAnalysisUsage(au); diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index a4b2299abc20f9..fa9b7895239d38 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -225,7 +225,7 @@ class ShrinkWrap : public MachineFunctionPass { /// Initialize the pass for \p MF. 
void init(MachineFunction &MF) { RCI.runOnMachineFunction(MF); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); MPDT = &getAnalysis(); Save = nullptr; Restore = nullptr; @@ -262,7 +262,7 @@ class ShrinkWrap : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -289,7 +289,7 @@ char &llvm::ShrinkWrapID = ShrinkWrap::ID; INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) @@ -670,7 +670,7 @@ bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF, Save = NewSave; Restore = NewRestore; - MDT->runOnMachineFunction(MF); + MDT->recalculate(MF); MPDT->runOnMachineFunction(MF); assert((MDT->dominates(Save, Restore) && MPDT->dominates(Restore, Save)) && diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index 1a60e9abbe2e26..4cf025261d6204 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -90,7 +90,7 @@ char &llvm::UnreachableMachineBlockElimID = UnreachableMachineBlockElim::ID; void UnreachableMachineBlockElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -98,7 +98,9 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { df_iterator_default_set Reachable; bool ModifiedPHI = false; - MachineDominatorTree *MDT = getAnalysisIfAvailable(); + MachineDominatorTreeWrapperPass *MDTWrapper = + 
getAnalysisIfAvailable(); + MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; MachineLoopInfo *MLI = getAnalysisIfAvailable(); // Mark all reachable blocks. diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp index d40725838c943c..a74362e8883972 100644 --- a/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -53,7 +53,7 @@ struct XRayInstrumentation : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -170,7 +170,9 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { if (!IgnoreLoops) { // Get MachineDominatorTree or compute it on the fly if it's unavailable - auto *MDT = getAnalysisIfAvailable(); + auto *MDTWrapper = + getAnalysisIfAvailable(); + auto *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; MachineDominatorTree ComputedMDT; if (!MDT) { ComputedMDT.getBase().recalculate(MF); diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 3f244ba10102af..154ae43b29d574 100644 --- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -50,7 +50,8 @@ struct LDTLSCleanup : public MachineFunctionPass { return false; } - MachineDominatorTree *DT = &getAnalysis(); + MachineDominatorTree *DT = + &getAnalysis().getDomTree(); return VisitNode(DT->getRootNode(), 0); } @@ -138,7 +139,7 @@ struct LDTLSCleanup : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp 
b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 2a4a3c0df08f9c..68243258a68f5d 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -126,7 +126,7 @@ char AArch64ConditionOptimizer::ID = 0; INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt", "AArch64 CondOpt Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt", "AArch64 CondOpt Pass", false, false) @@ -135,8 +135,8 @@ FunctionPass *llvm::createAArch64ConditionOptimizerPass() { } void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -332,7 +332,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { return false; TII = MF.getSubtarget().getInstrInfo(); - DomTree = &getAnalysis(); + DomTree = &getAnalysis().getDomTree(); MRI = &MF.getRegInfo(); bool Changed = false; diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 8c16a88a13a408..9a788123b1ffa3 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -795,7 +795,7 @@ char AArch64ConditionalCompares::ID = 0; INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) @@ -806,8 +806,8 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { 
void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -933,7 +933,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { TRI = MF.getSubtarget().getRegisterInfo(); SchedModel = MF.getSubtarget().getSchedModel(); MRI = &MF.getRegInfo(); - DomTree = &getAnalysis(); + DomTree = &getAnalysis().getDomTree(); Loops = &getAnalysis(); MBPI = &getAnalysis(); Traces = &getAnalysis(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index 0c7be9f42c570d..f71fe323a6d358 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -524,8 +524,8 @@ void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); if (!IsOptNone) { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); } @@ -557,7 +557,8 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis().get(MF); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis(); + IsOptNone ? 
nullptr + : &getAnalysis().getDomTree(); GISelCSEAnalysisWrapper &Wrapper = getAnalysis().getCSEWrapper(); auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 31f77be20f348b..e9b25924b35f73 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -823,8 +823,8 @@ void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { getSelectionDAGFallbackAnalysisUsage(AU); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -856,7 +856,8 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); GISelKnownBits *KB = &getAnalysis().get(MF); - MachineDominatorTree *MDT = &getAnalysis(); + MachineDominatorTree *MDT = + &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp index a0c6bf7cc31c0a..8c914382b1ecb2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -46,7 +46,7 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -192,7 +192,7 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) 
{ INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, @@ -209,7 +209,8 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() { bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction( MachineFunction &MF) { - MachineDominatorTree &DT = getAnalysis(); + MachineDominatorTree &DT = + getAnalysis().getDomTree(); MachinePostDominatorTree &PDT = getAnalysis(); MachineUniformityInfo &MUI = getAnalysis().getUniformityInfo(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index f36374b08b34d5..46d44704af5a7d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -465,8 +465,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); if (!IsOptNone) { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -494,7 +494,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis().get(MF); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis(); + IsOptNone ? 
nullptr + : &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 3f01a328afaf83..4d0cb467ba374d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -238,8 +238,8 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); if (!IsOptNone) { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } AU.addRequired(); @@ -272,7 +272,8 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &STI = MF.getSubtarget(); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis(); + IsOptNone ? nullptr + : &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize()); AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 35abd6eddde851..74f0540239c939 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -421,8 +421,8 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); if (!IsOptNone) { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -449,7 +449,8 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { const auto *LI = ST.getLegalizerInfo(); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis(); + IsOptNone ? 
nullptr + : &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp index 2ea03ddb1fccd6..d1985f46b1c448 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -33,7 +33,7 @@ StringRef AMDGPURegBankSelect::getPassName() const { void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); // TODO: Preserve DomTree RegBankSelect::getAnalysisUsage(AU); } @@ -41,7 +41,7 @@ void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, "AMDGPU Register Bank Select", false, false) INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, "AMDGPU Register Bank Select", false, false) @@ -63,7 +63,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); MachineCycleInfo &CycleInfo = getAnalysis().getCycleInfo(); - MachineDominatorTree &DomTree = getAnalysis(); + MachineDominatorTree &DomTree = + getAnalysis().getDomTree(); MachineUniformityInfo Uniformity = computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index 0a96c643d9bdc6..b35f59bc5ba30b 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -113,7 +113,7 @@ class R600MachineCFGStructurizer : public MachineFunctionPass { } void getAnalysisUsage(AnalysisUsage &AU) const override { - 
AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -140,8 +140,8 @@ class R600MachineCFGStructurizer : public MachineFunctionPass { FuncRep = &MF; MLI = &getAnalysis(); LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); - MDT = &getAnalysis(); - LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr);); + MDT = &getAnalysis().getDomTree(); + LLVM_DEBUG(MDT->print(dbgs());); PDT = &getAnalysis(); LLVM_DEBUG(PDT->print(dbgs());); prepare(); @@ -1629,7 +1629,7 @@ void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer", "AMDGPU CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer", diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 77935cb4cde1a9..8bac570d59d4a6 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -103,8 +103,8 @@ class R600VectorRegMerger : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index 59e27478759096..64185db02ec1de 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -35,8 +35,8 @@ class R600Packetizer : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { 
AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index a00ca625fc7390..68c5f23c8e11f3 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -162,8 +162,8 @@ class SIFixSGPRCopies : public MachineFunctionPass { StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -173,7 +173,7 @@ class SIFixSGPRCopies : public MachineFunctionPass { INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -611,8 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TRI = ST.getRegisterInfo(); TII = ST.getInstrInfo(); - MDT = &getAnalysis(); - + MDT = &getAnalysis().getDomTree(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index abb72e8e63c336..afc6353ec81167 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -48,8 +48,8 @@ class SILateBranchLowering : public MachineFunctionPass { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -60,7 +60,7 @@ char 
SILateBranchLowering::ID = 0; INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) @@ -149,7 +149,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 5dc3457b5bfae1..75a1575f2180ea 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -149,7 +149,7 @@ class SILowerControlFlow : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addUsedIfAvailable(); // Should preserve the same set that TwoAddressInstructions does. - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.addPreservedID(LiveVariablesID); @@ -764,7 +764,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { LIS = getAnalysisIfAvailable(); // This doesn't actually need LiveVariables, but we can preserve them. LV = getAnalysisIfAvailable(); - MDT = getAnalysisIfAvailable(); + auto *MDTWrapper = getAnalysisIfAvailable(); + MDT = MDTWrapper ? 
&MDTWrapper->getDomTree() : nullptr; MRI = &MF.getRegInfo(); BoolRC = TRI->getBoolRC(); diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 32dad0c425c041..9f0a9e03701b48 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -51,7 +51,7 @@ class SILowerI1Copies : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -399,7 +399,7 @@ class LoopFinder { INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, false) @@ -445,8 +445,9 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { MachineFunctionProperties::Property::Selected)) return false; - Vreg1LoweringHelper Helper(&TheMF, &getAnalysis(), - &getAnalysis()); + Vreg1LoweringHelper Helper( + &TheMF, &getAnalysis().getDomTree(), + &getAnalysis()); bool Changed = false; Changed |= Helper.lowerCopiesFromI1(); diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index 8204a70e72d916..18d66e4191522e 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -148,10 +148,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -618,7 +618,7 @@ char 
SIOptimizeVGPRLiveRange::ID = 0; INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, "SI Optimize VGPR LiveRange", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE, @@ -635,7 +635,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); Loops = &getAnalysis(); LV = &getAnalysis(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 4b5f9bdd82b8db..4c5e60c873bb95 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3157,7 +3157,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const { - auto &MDT = LIS->getAnalysis(); + auto &MDT = LIS->getAnalysis().getDomTree(); SlotIndex UseIdx = LIS->getInstructionIndex(Use); SlotIndex DefIdx; diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 913942dda19d91..811cd410fe31c6 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -241,7 +241,7 @@ class SIWholeQuadMode : public MachineFunctionPass { AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -259,7 +259,7 @@ char SIWholeQuadMode::ID = 0; INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -1687,7 +1687,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis(); - MDT = getAnalysisIfAvailable(); + auto *MDTWrapper = getAnalysisIfAvailable(); + MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; PDT = getAnalysisIfAvailable(); if (ST->isWave32()) { diff --git a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp index 0e3e4d34aa6a14..9d616e103f1717 100644 --- a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp +++ b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp @@ -61,7 +61,7 @@ char ARCBranchFinalize::ID = 0; INITIALIZE_PASS_BEGIN(ARCBranchFinalize, "arc-branch-finalize", "ARC finalize branches", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(ARCBranchFinalize, "arc-branch-finalize", "ARC finalize branches", false, false) diff --git a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp index e7a0b352db8d97..36f811c0aa003f 100644 --- a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp +++ b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp @@ -60,8 +60,8 @@ class ARCOptAddrMode : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } bool runOnMachineFunction(MachineFunction &MF) override; @@ -119,7 +119,7 @@ class ARCOptAddrMode : public MachineFunctionPass { char ARCOptAddrMode::ID = 0; INITIALIZE_PASS_BEGIN(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false, false) @@ -508,7 +508,7 @@ bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) { AST = &MF.getSubtarget(); AII = AST->getInstrInfo(); MRI = &MF.getRegInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); bool Changed = false; for (auto &MBB : MF) diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 9579053943f9f0..90f5c6c40b49c6 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -229,7 +229,7 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -399,7 +399,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isPositionIndependentOrROPI = STI->getTargetLowering()->isPositionIndependent() || STI->isROPI(); AFI = MF->getInfo(); - DT = &getAnalysis(); + DT = &getAnalysis().getDomTree(); isThumb = AFI->isThumbFunction(); isThumb1 = AFI->isThumb1OnlyFunction(); diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 4a5b672f862bec..e5e817f1ed9a29 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2161,8 +2161,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -2186,7 +2186,7 @@ char ARMPreAllocLoadStoreOpt::ID = 0; INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) 
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) @@ -2204,7 +2204,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); MRI = &Fn.getRegInfo(); - DT = &getAnalysis(); + DT = &getAnalysis().getDomTree(); MF = &Fn; AA = &getAnalysis().getAAResults(); diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index c9bbc41ac13bac..4882e8533caf1f 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -59,8 +59,8 @@ class MVETPAndVPTOptimisations : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -93,7 +93,7 @@ INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE, "ARM MVE TailPred and VPT Optimisations pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE, "ARM MVE TailPred and VPT Optimisations pass", false, false) @@ -1065,7 +1065,8 @@ bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { TII = static_cast(STI.getInstrInfo()); MRI = &Fn.getRegInfo(); MachineLoopInfo *MLI = &getAnalysis(); - MachineDominatorTree *DT = &getAnalysis(); + MachineDominatorTree *DT = + &getAnalysis().getDomTree(); LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" << "********** Function: " << Fn.getName() << '\n'); diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp 
index 4acdd571f6c968..97bdd4c45a8c68 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -218,7 +218,7 @@ class CSKYConstantIslands : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 4c18e076c43936..99745941d57986 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -219,8 +219,8 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -285,7 +285,7 @@ char HexagonBitSimplify::ID = 0; INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexagon-bit-simplify", "Hexagon bit simplification", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonBitSimplify, "hexagon-bit-simplify", "Hexagon bit simplification", false, false) @@ -2800,7 +2800,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { auto &HRI = *HST.getRegisterInfo(); auto &HII = *HST.getInstrInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); MachineRegisterInfo &MRI = MF.getRegInfo(); bool Changed; diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index f2a02fe9540bfa..f0933765bbcbda 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -218,8 +218,8 @@ namespace { HexagonConstExtenders() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { - 
AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -569,7 +569,7 @@ namespace { INITIALIZE_PASS_BEGIN(HexagonConstExtenders, "hexagon-cext-opt", "Hexagon constant-extender optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonConstExtenders, "hexagon-cext-opt", "Hexagon constant-extender optimization", false, false) @@ -1973,7 +1973,7 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) { HST = &MF.getSubtarget(); HII = HST->getInstrInfo(); HRI = HST->getRegisterInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); MRI = &MF.getRegInfo(); AssignmentMap IMap; diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp index 97917270601bc8..a5c47e67de892f 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp @@ -50,8 +50,8 @@ class HexagonCopyHoisting : public MachineFunctionPass { AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index cb820e2158992d..03f6882e6889f1 100644 --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -162,8 +162,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -1054,7 +1054,7 @@ bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) { TRI = ST.getRegisterInfo(); MFN = &MF; MRI = &MF.getRegInfo(); - MDT = &getAnalysis(); + MDT = 
&getAnalysis().getDomTree(); MLI = &getAnalysis(); MBPI = EnableHexagonBP ? &getAnalysis() : nullptr; diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 204f3b6b20c751..8a23b7743e8395 100644 --- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -155,8 +155,8 @@ namespace { AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -254,7 +254,7 @@ namespace llvm { INITIALIZE_PASS_BEGIN(HexagonExpandCondsets, "expand-condsets", "Hexagon Expand Condsets", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(HexagonExpandCondsets, "expand-condsets", @@ -1277,7 +1277,7 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) { HII = static_cast(MF.getSubtarget().getInstrInfo()); TRI = MF.getSubtarget().getRegisterInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); LIS = &getAnalysis(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 232651132d6e4f..a5d290a61f3283 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -413,7 +413,7 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF, auto &HRI = *MF.getSubtarget().getRegisterInfo(); MachineDominatorTree MDT; - MDT.runOnMachineFunction(MF); + MDT.calculate(MF); MachinePostDominatorTree MPT; MPT.runOnMachineFunction(MF); diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 1e373f6061bbfc..a4304b0531666a 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -515,8 +515,8 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -1497,7 +1497,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { HRI = ST.getRegisterInfo(); MFN = &MF; MRI = &MF.getRegInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); // Clean up before any further processing, so that dead code does not // get used in a newly generated "insert" instruction. Have a custom @@ -1607,6 +1607,6 @@ FunctionPass *llvm::createHexagonGenInsert() { INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert", "Hexagon generate \"insert\" instructions", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert", "Hexagon generate \"insert\" instructions", false, false) diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp index afd49631943f26..651ccc2db9ba24 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp @@ -56,8 +56,8 @@ class HexagonGenMemAbsolute : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } bool runOnMachineFunction(MachineFunction &Fn) override; @@ -82,7 +82,8 @@ bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) { MRI = &Fn.getRegInfo(); TRI = Fn.getRegInfo().getTargetRegisterInfo(); - MachineDominatorTree &MDT = getAnalysis(); + MachineDominatorTree &MDT = + getAnalysis().getDomTree(); // Loop over all of the basic blocks for (MachineFunction::iterator MBBb = 
Fn.begin(), MBBe = Fn.end(); diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 92e74327361150..5bb2d7d80ad541 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -93,8 +93,8 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -130,7 +130,7 @@ char HexagonGenPredicate::ID = 0; INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred", "Hexagon generate predicate operations", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred", "Hexagon generate predicate operations", false, false) diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 31e37dcce415f1..19a024078b1045 100644 --- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -118,7 +118,7 @@ namespace { StringRef getPassName() const override { return "Hexagon Hardware Loops"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -368,7 +368,7 @@ namespace { INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops", "Hexagon Hardware Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops", "Hexagon Hardware Loops", false, false) @@ -386,7 +386,7 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); MRI = &MF.getRegInfo(); - MDT = &getAnalysis(); + MDT = 
&getAnalysis().getDomTree(); const HexagonSubtarget &HST = MF.getSubtarget(); TII = HST.getInstrInfo(); TRI = HST.getRegisterInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp index 0e82bf6e5331da..e7f5c257b21c1f 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -70,7 +70,7 @@ class HexagonOptAddrMode : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.setPreservesAll(); } @@ -122,7 +122,7 @@ char HexagonOptAddrMode::ID = 0; INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode", false, false) @@ -872,7 +872,7 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) { HII = HST.getInstrInfo(); HRI = HST.getRegisterInfo(); const auto &MDF = getAnalysis(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); DataFlowGraph G(MF, *HII, *HRI, *MDT, MDF); // Need to keep dead phis because we can propagate uses of registers into diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp index 4131f2a31755fb..3c17f680011496 100644 --- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp +++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -63,7 +63,7 @@ namespace { HexagonRDFOpt() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); @@ -109,7 +109,7 @@ char HexagonRDFOpt::ID = 0; INITIALIZE_PASS_BEGIN(HexagonRDFOpt, 
"hexagon-rdf-opt", "Hexagon RDF optimizations", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(HexagonRDFOpt, "hexagon-rdf-opt", "Hexagon RDF optimizations", false, false) @@ -302,7 +302,7 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { RDFCount++; } - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); const auto &MDF = getAnalysis(); const auto &HII = *MF.getSubtarget().getInstrInfo(); const auto &HRI = *MF.getSubtarget().getRegisterInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 56472d633694ae..2d5352b08caed7 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -97,9 +97,9 @@ namespace { AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -124,7 +124,7 @@ char HexagonPacketizer::ID = 0; INITIALIZE_PASS_BEGIN(HexagonPacketizer, "hexagon-packetizer", "Hexagon Packetizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) diff --git a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp index 902c7ceb869ae3..7ceb97642bba13 100644 --- a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -83,7 +83,7 @@ class OptimizePICCall : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - 
AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -197,7 +197,8 @@ bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) { return false; // Do a pre-order traversal of the dominator tree. - MachineDominatorTree *MDT = &getAnalysis(); + MachineDominatorTree *MDT = + &getAnalysis().getDomTree(); bool Changed = false; SmallVector WorkList(1, MBBInfo(MDT->getRootNode())); diff --git a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp index 0578655f0443a6..bd8a065011c928 100644 --- a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp @@ -110,8 +110,8 @@ void MipsPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addPreserved(); if (!IsOptNone) { - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); } MachineFunctionPass::getAnalysisUsage(AU); } @@ -139,7 +139,8 @@ bool MipsPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { GISelKnownBits *KB = &getAnalysis().get(MF); MachineDominatorTree *MDT = - IsOptNone ? nullptr : &getAnalysis(); + IsOptNone ? 
nullptr + : &getAnalysis().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, LI, EnableOpt, F.hasOptSize(), F.hasMinSize()); MipsPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr, diff --git a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp index 799890928577c4..bf632801646484 100644 --- a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -165,7 +165,7 @@ class PPCBranchCoalescing : public MachineFunctionPass { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -195,7 +195,7 @@ FunctionPass *llvm::createPPCBranchCoalescingPass() { INITIALIZE_PASS_BEGIN(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) @@ -214,7 +214,7 @@ void PPCBranchCoalescing::CoalescingCandidateInfo::clear() { } void PPCBranchCoalescing::initialize(MachineFunction &MF) { - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); MPDT = &getAnalysis(); TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp index 1f9947f6f32711..c4190bb9a1c4ed 100644 --- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp +++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp @@ -55,7 +55,7 @@ namespace { } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -70,7 +70,7 @@ namespace { INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops 
Verify", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) @@ -160,7 +160,7 @@ static bool verifyCTRBranch(MachineBasicBlock *MBB, } bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before // any other instructions that might clobber the ctr register. diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index c6db8a7bbeb855..c57b48055d2ad7 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -156,11 +156,11 @@ struct PPCMIPeephole : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -200,7 +200,7 @@ void PPCMIPeephole::addRegToUpdateWithLine(Register Reg, int Line) { void PPCMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); - MDT = &getAnalysis(); + MDT = &getAnalysis().getDomTree(); MPDT = &getAnalysis(); MBFI = &getAnalysis(); LV = &getAnalysis(); @@ -2029,7 +2029,7 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI, INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, "PowerPC MI Peephole Optimization", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, diff --git 
a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 0504db239f6713..d1cc2ad5c481ff 100644 --- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -427,7 +427,7 @@ class PPCReduceCRLogicals : public MachineFunctionPass { CRLogicalOpInfo createCRLogicalOpInfo(MachineInstr &MI); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -730,7 +730,7 @@ void PPCReduceCRLogicals::collectCRLogicals() { INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE, "PowerPC Reduce CR logical Operation", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(PPCReduceCRLogicals, DEBUG_TYPE, "PowerPC Reduce CR logical Operation", false, false) diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 0d8c71f9f2e699..69e046972f3d43 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -368,8 +368,8 @@ namespace { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -379,7 +379,7 @@ INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE, "PowerPC VSX FMA Mutation", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE, "PowerPC VSX FMA Mutation", false, false) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp index 9c28944abc7672..8fa9dba2853876 100644 --- 
a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp @@ -112,8 +112,8 @@ void RISCVPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { getSelectionDAGFallbackAnalysisUsage(AU); AU.addRequired(); AU.addPreserved(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -143,7 +143,8 @@ bool RISCVPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { const auto *LI = ST.getLegalizerInfo(); GISelKnownBits *KB = &getAnalysis().get(MF); - MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *MDT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); GISelCSEAnalysisWrapper &Wrapper = getAnalysis().getCSEWrapper(); auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp index 9a35fffae05890..6a695119be25a2 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp @@ -109,8 +109,8 @@ void RISCVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { getSelectionDAGFallbackAnalysisUsage(AU); AU.addRequired(); AU.addPreserved(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); @@ -142,7 +142,8 @@ bool RISCVPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F); GISelKnownBits *KB = &getAnalysis().get(MF); - MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *MDT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
diff --git a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp index 8073ed0e2a3c87..bf8d109ff71f36 100644 --- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp @@ -58,7 +58,7 @@ FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) { void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -75,7 +75,8 @@ bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) { return false; } - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *DT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); return VisitNode(DT->getRootNode(), 0); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index 06758e46519727..f746bf4307a08a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -53,8 +53,8 @@ class WebAssemblyCFGSort final : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); @@ -387,7 +387,7 @@ bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) { const auto &MLI = getAnalysis(); const auto &WEI = getAnalysis(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); // Liveness is not tracked for VALUE_STACK physreg.
MF.getRegInfo().invalidateLiveness(); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index d8cbddf74545da..77e82a32545f1c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -48,7 +48,7 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass { StringRef getPassName() const override { return "WebAssembly CFG Stackify"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -252,7 +252,7 @@ void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) { void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) { assert(!MBB.isEHPad()); MachineFunction &MF = *MBB.getParent(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &TII = *MF.getSubtarget().getInstrInfo(); const auto &MFI = *MF.getInfo(); @@ -465,7 +465,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) { void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) { assert(MBB.isEHPad()); MachineFunction &MF = *MBB.getParent(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &TII = *MF.getSubtarget().getInstrInfo(); const auto &MLI = getAnalysis(); const auto &WEI = getAnalysis(); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp index 8deac76b2bc3d9..f23f21c8f69fb9 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp @@ -31,7 +31,7 @@ char WebAssemblyExceptionInfo::ID = 0; INITIALIZE_PASS_BEGIN(WebAssemblyExceptionInfo, DEBUG_TYPE, "WebAssembly Exception Information", true, true) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(WebAssemblyExceptionInfo, DEBUG_TYPE, "WebAssembly Exception Information", true, true) @@ -45,7 +45,7 @@ bool WebAssemblyExceptionInfo::runOnMachineFunction(MachineFunction &MF) { ExceptionHandling::Wasm || !MF.getFunction().hasPersonalityFn()) return false; - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); auto &MDF = getAnalysis(); recalculate(MF, MDT, MDF); LLVM_DEBUG(dump()); @@ -273,7 +273,7 @@ void WebAssemblyExceptionInfo::releaseMemory() { void WebAssemblyExceptionInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp index 2180f57c106a7f..2ab5bcdd838d00 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp @@ -56,8 +56,8 @@ class WebAssemblyMemIntrinsicResults final : public MachineFunctionPass { AU.setPreservesCFG(); AU.addRequired(); AU.addPreserved(); - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); @@ -180,7 +180,7 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) { }); MachineRegisterInfo &MRI = MF.getRegInfo(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const WebAssemblyTargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); const auto &LibInfo = diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index d4edb6bf18d932..e38905c20b8390 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++
b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -48,13 +48,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); AU.addPreservedID(LiveVariablesID); - AU.addPreserved<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -813,7 +813,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { WebAssemblyFunctionInfo &MFI = *MF.getInfo(); const auto *TII = MF.getSubtarget().getInstrInfo(); const auto *TRI = MF.getSubtarget().getRegisterInfo(); - auto &MDT = getAnalysis<MachineDominatorTree>(); + auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); auto &LIS = getAnalysis(); // Walk the instructions from the bottom up. Currently we don't look past diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index d6d077363f6fbb..9bd4e783f58398 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -127,7 +127,7 @@ FunctionPass *llvm::createX86FlagsCopyLoweringPass() { char X86FlagsCopyLoweringPass::ID = 0; void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -257,7 +257,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TII = Subtarget->getInstrInfo(); TRI = Subtarget->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTree>(); + MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); PromoteRC = &X86::GR8RegClass; if (MF.empty()) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 1f93d293bc2aab..ce1bbc8a959bfb 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -10324,7 +10324,8 @@ struct LDTLSCleanup : public MachineFunctionPass {
return false; } - MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>(); + MachineDominatorTree *DT = + &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); return VisitNode(DT->getRootNode(), 0); } @@ -10411,7 +10412,7 @@ struct LDTLSCleanup : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); MachineFunctionPass::getAnalysisUsage(AU); } }; diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 4dfe7556df0030..fff5d17160230e 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -237,7 +237,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage( AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTreeWrapperPass>(); AU.addRequired(); AU.setPreservesCFG(); } @@ -270,7 +270,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction( TRI = STI->getRegisterInfo(); LLVM_DEBUG(dbgs() << "Building gadget graph...\n"); const auto &MLI = getAnalysis(); - const auto &MDT = getAnalysis<MachineDominatorTree>(); + const auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); const auto &MDF = getAnalysis(); std::unique_ptr Graph = getGadgetGraph(MF, MLI, MDT, MDF); LLVM_DEBUG(dbgs() << "Building gadget graph...
Done\n"); @@ -801,7 +801,7 @@ bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch( INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, "X86 LVI load hardening", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, "X86 LVI load hardening", false, false) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp index b97d75a822f753..5f0697f5aaad75 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp @@ -65,7 +65,7 @@ static bool shouldNotRemoveInstruction(const TargetInstrInfo &TII, static void extractInstrFromFunction(Oracle &O, MachineFunction &MF) { MachineDominatorTree MDT; - MDT.runOnMachineFunction(MF); + MDT.calculate(MF); auto MRI = &MF.getRegInfo(); SetVector ToDelete; diff --git a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp index d4e214c43e98c6..98b154f86c949f 100644 --- a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp +++ b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp @@ -167,7 +167,7 @@ body: | WebAssemblyExceptionInfo WEI; MachineDominatorTree MDT; MachineDominanceFrontier MDF; - MDT.runOnMachineFunction(*MF); + MDT.calculate(*MF); MDF.getBase().analyze(MDT.getBase()); WEI.recalculate(*MF, MDT, MDF); @@ -342,7 +342,7 @@ body: | WebAssemblyExceptionInfo WEI; MachineDominatorTree MDT; MachineDominanceFrontier MDF; - MDT.runOnMachineFunction(*MF); + MDT.calculate(*MF); MDF.getBase().analyze(MDT.getBase()); WEI.recalculate(*MF, MDT, MDF); From a7d28f5a4d41a176a62bddf04be6478550f98df2 Mon Sep 17 00:00:00 2001 From: 
Simon Pilgrim Date: Tue, 11 Jun 2024 14:35:15 +0100 Subject: [PATCH 72/82] [X86] early-ifcvt-remarks.ll - add codegen checks --- llvm/test/CodeGen/X86/early-ifcvt-remarks.ll | 83 +++++++++++++++++--- 1 file changed, 74 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll b/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll index 4e07070fba0d1e..85c9aadbf70481 100644 --- a/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll +++ b/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc %s -x86-early-ifcvt -pass-remarks='early-ifcvt' -pass-remarks-missed='early-ifcvt' -mcpu=k8 -o - 2>&1 | FileCheck %s target triple = "x86_64-none-none" @@ -6,7 +7,26 @@ target triple = "x86_64-none-none" ; CHECK-SAME: and the short leg adds another {{[0-9]+}} cycles{{s?}}, ; CHECK-SAME: and the long leg adds another {{[0-9]+}} cycles{{s?}}, ; CHECK-SAME: each staying under the threshold of {{[0-9]+}} cycles{{s?}}. + +; CHECK: remark: :0:0: did not if-convert branch: +; CHECK-SAME: the condition would add {{[0-9]+}} cycles{{s?}} to the critical path, +; CHECK-SAME: and the short leg would add another {{[0-9]+}} cycles{{s?}}, +; CHECK-SAME: and the long leg would add another {{[0-9]+}} cycles{{s?}} exceeding the limit of {{[0-9]+}} cycles{{s?}}. + +; CHECK: remark: :0:0: did not if-convert branch: +; CHECK-SAME: the resulting critical path ({{[0-9]+}} cycles{{s?}}) +; CHECK-SAME: would extend the shorter leg's critical path ({{[0-9]+}} cycle{{s?}}) +; CHECK-SAME: by more than the threshold of {{[0-9]+}} cycles{{s?}}, +; CHECK-SAME: which cannot be hidden by available ILP. 
+ define i32 @mm1(i1 %pred, i32 %val) { +; CHECK-LABEL: mm1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: leal 1(%rsi), %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: cmovel %esi, %eax +; CHECK-NEXT: retq entry: br i1 %pred, label %if.true, label %if.else @@ -19,11 +39,20 @@ if.else: ret i32 %res } -; CHECK: remark: :0:0: did not if-convert branch: -; CHECK-SAME: the condition would add {{[0-9]+}} cycles{{s?}} to the critical path, -; CHECK-SAME: and the short leg would add another {{[0-9]+}} cycles{{s?}}, -; CHECK-SAME: and the long leg would add another {{[0-9]+}} cycles{{s?}} exceeding the limit of {{[0-9]+}} cycles{{s?}}. define i32 @mm2(i1 %pred, i32 %val, i32 %e1, i32 %e2, i32 %e3, i32 %e4, i32 %e5) { +; CHECK-LABEL: mm2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: je .LBB1_2 +; CHECK-NEXT: # %bb.1: # %if.true +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: addl %ecx, %r8d +; CHECK-NEXT: addl %edx, %r8d +; CHECK-NEXT: addl %r8d, %r9d +; CHECK-NEXT: movl %r9d, %eax +; CHECK-NEXT: .LBB1_2: # %if.else +; CHECK-NEXT: retq entry: br i1 %pred, label %if.true, label %if.else @@ -39,12 +68,48 @@ if.else: ret i32 %res } -; CHECK: did not if-convert branch: -; CHECK-SAME: the resulting critical path ({{[0-9]+}} cycles{{s?}}) -; CHECK-SAME: would extend the shorter leg's critical path ({{[0-9]+}} cycle{{s?}}) -; CHECK-SAME: by more than the threshold of {{[0-9]+}} cycles{{s?}}, -; CHECK-SAME: which cannot be hidden by available ILP. 
define i32 @mm3(i1 %pred, i32 %val, i32 %e1, i128 %e2, i128 %e3, i128 %e4, i128 %e5) { +; CHECK-LABEL: mm3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: testb $1, %dil +; CHECK-NEXT: movl %esi, %r10d +; CHECK-NEXT: jne .LBB2_2 +; CHECK-NEXT: # %bb.1: # %if.false +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: imull %edx, %edx +; CHECK-NEXT: movslq %edx, %r10 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: movl %edx, %r9d +; CHECK-NEXT: mulq %r10 +; CHECK-NEXT: imulq %r10, %r8 +; CHECK-NEXT: sarq $63, %r10 +; CHECK-NEXT: imulq %rcx, %r10 +; CHECK-NEXT: addq %rdx, %r8 +; CHECK-NEXT: addq %r10, %r8 +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 +; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: xorq %r8, %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: xorq %rsi, %r10 +; CHECK-NEXT: xorq %rax, %r10 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: sarq %cl, %rax +; CHECK-NEXT: addq %rdi, %rdi +; CHECK-NEXT: notb %cl +; CHECK-NEXT: shlq %cl, %rdi +; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: shrq %cl, %r10 +; CHECK-NEXT: orq %rdi, %r10 +; CHECK-NEXT: testb $64, %sil +; CHECK-NEXT: cmovneq %rax, %r10 +; CHECK-NEXT: movl %r9d, %eax +; CHECK-NEXT: .LBB2_2: # %if.endif +; CHECK-NEXT: addl %r10d, %eax +; CHECK-NEXT: retq entry: br i1 %pred, label %if.true, label %if.false From bc5ced54cc8d16332eacfc5f842aea3212e8ab5e Mon Sep 17 00:00:00 2001 From: Sergei Lebedev <185856+superbobry@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:46:43 +0100 Subject: [PATCH 73/82] Updated the annotations of Python bindings (#92733) --- mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 118 ++++++++++++++--------- 1 file changed, 72 insertions(+), 46 deletions(-) diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 586bf7f8e93fba..1e1b2a8348b1d7 100644 --- 
a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -479,7 +479,7 @@ class AffineExpr: class Attribute: @staticmethod - def parse(asm: str, context: Optional[Context] = None) -> Attribute: + def parse(asm: str | bytes, context: Optional[Context] = None) -> Attribute: """ Parses an attribute from an assembly form. Raises an MLIRError on failure. """ @@ -520,7 +520,7 @@ class Attribute: class Type: @staticmethod - def parse(asm: str, context: Optional[Context] = None) -> Type: + def parse(asm: str | bytes, context: Optional[Context] = None) -> Type: """ Parses the assembly form of a type. @@ -741,7 +741,7 @@ class AffineMap: def results(self) -> "AffineMapExprList": ... class AffineMapAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(affine_map: AffineMap) -> AffineMapAttr: """ @@ -779,7 +779,7 @@ class AffineSymbolExpr(AffineExpr): def position(self) -> int: ... class ArrayAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(attributes: List, context: Optional[Context] = None) -> ArrayAttr: """ @@ -823,7 +823,7 @@ class AttrBuilder: """ class BF16Type(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> BF16Type: """ @@ -909,6 +909,11 @@ class BlockArgument(Value): def owner(self) -> Block: ... class BlockArgumentList: + @overload + def __getitem__(self, arg0: int) -> BlockArgument: ... + @overload + def __getitem__(self, arg0: slice) -> BlockArgumentList: ... + def __len__(self) -> int: ... def __add__(self, arg0: BlockArgumentList) -> List[BlockArgument]: ... @property def types(self) -> List[Type]: ... 
@@ -955,7 +960,7 @@ class BoolAttr(Attribute): """ class ComplexType(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(arg0: Type) -> ComplexType: """ @@ -1016,7 +1021,7 @@ class Context: class DenseBoolArrayAttr(Attribute): @staticmethod def get( - values: List[bool], context: Optional[Context] = None + values: Sequence[bool], context: Optional[Context] = None ) -> DenseBoolArrayAttr: """ Gets a uniqued dense array attribute @@ -1113,7 +1118,7 @@ class DenseElementsAttr(Attribute): class DenseF32ArrayAttr(Attribute): @staticmethod def get( - values: List[float], context: Optional[Context] = None + values: Sequence[float], context: Optional[Context] = None ) -> DenseF32ArrayAttr: """ Gets a uniqued dense array attribute @@ -1141,7 +1146,7 @@ class DenseF32ArrayIterator: class DenseF64ArrayAttr(Attribute): @staticmethod def get( - values: List[float], context: Optional[Context] = None + values: Sequence[float], context: Optional[Context] = None ) -> DenseF64ArrayAttr: """ Gets a uniqued dense array attribute @@ -1167,6 +1172,14 @@ class DenseF64ArrayIterator: def __next__(self) -> float: ... class DenseFPElementsAttr(DenseElementsAttr): + @staticmethod + def get( + array: Buffer, + signless: bool = True, + type: Optional[Type] = None, + shape: Optional[List[int]] = None, + context: Optional[Context] = None, + ) -> DenseFPElementsAttr: ... @staticmethod def isinstance(other: Attribute) -> bool: ... def __getitem__(self, arg0: int) -> float: ... 
@@ -1180,7 +1193,7 @@ class DenseFPElementsAttr(DenseElementsAttr): class DenseI16ArrayAttr(Attribute): @staticmethod - def get(values: List[int], context: Optional[Context] = None) -> DenseI16ArrayAttr: + def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI16ArrayAttr: """ Gets a uniqued dense array attribute """ @@ -1206,7 +1219,7 @@ class DenseI16ArrayIterator: class DenseI32ArrayAttr(Attribute): @staticmethod - def get(values: List[int], context: Optional[Context] = None) -> DenseI32ArrayAttr: + def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI32ArrayAttr: """ Gets a uniqued dense array attribute """ @@ -1232,7 +1245,7 @@ class DenseI32ArrayIterator: class DenseI64ArrayAttr(Attribute): @staticmethod - def get(values: List[int], context: Optional[Context] = None) -> DenseI64ArrayAttr: + def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI64ArrayAttr: """ Gets a uniqued dense array attribute """ @@ -1258,7 +1271,7 @@ class DenseI64ArrayIterator: class DenseI8ArrayAttr(Attribute): @staticmethod - def get(values: List[int], context: Optional[Context] = None) -> DenseI8ArrayAttr: + def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI8ArrayAttr: """ Gets a uniqued dense array attribute """ @@ -1283,6 +1296,14 @@ class DenseI8ArrayIterator: def __next__(self) -> int: ... class DenseIntElementsAttr(DenseElementsAttr): + @staticmethod + def get( + array: Buffer, + signless: bool = True, + type: Optional[Type] = None, + shape: Optional[List[int]] = None, + context: Optional[Context] = None, + ) -> DenseIntElementsAttr: ... @staticmethod def isinstance(other: Attribute) -> bool: ... def __getitem__(self, arg0: int) -> int: ... @@ -1422,7 +1443,7 @@ class Dialects: def __getitem__(self, arg0: str) -> Dialect: ... 
class DictAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(value: Dict = {}, context: Optional[Context] = None) -> DictAttr: """ @@ -1453,7 +1474,7 @@ class FloatType(Type): """ class F16Type(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> F16Type: """ @@ -1466,7 +1487,7 @@ class F16Type(FloatType): def typeid(self) -> TypeID: ... class F32Type(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> F32Type: """ @@ -1479,7 +1500,7 @@ class F32Type(FloatType): def typeid(self) -> TypeID: ... class F64Type(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> F64Type: """ @@ -1513,7 +1534,7 @@ class FlatSymbolRefAttr(Attribute): """ class Float8E4M3B11FNUZType(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> Float8E4M3B11FNUZType: """ @@ -1526,7 +1547,7 @@ class Float8E4M3B11FNUZType(FloatType): def typeid(self) -> TypeID: ... class Float8E4M3FNType(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> Float8E4M3FNType: """ @@ -1539,7 +1560,7 @@ class Float8E4M3FNType(FloatType): def typeid(self) -> TypeID: ... class Float8E4M3FNUZType(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> Float8E4M3FNUZType: """ @@ -1552,7 +1573,7 @@ class Float8E4M3FNUZType(FloatType): def typeid(self) -> TypeID: ... 
class Float8E5M2FNUZType(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> Float8E5M2FNUZType: """ @@ -1565,7 +1586,7 @@ class Float8E5M2FNUZType(FloatType): def typeid(self) -> TypeID: ... class Float8E5M2Type(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> Float8E5M2Type: """ @@ -1578,7 +1599,7 @@ class Float8E5M2Type(FloatType): def typeid(self) -> TypeID: ... class FloatAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(type: Type, value: float, loc: Optional[Location] = None) -> FloatAttr: """ @@ -1612,7 +1633,7 @@ class FloatAttr(Attribute): """ class FloatTF32Type(FloatType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> FloatTF32Type: """ @@ -1625,7 +1646,7 @@ class FloatTF32Type(FloatType): def typeid(self) -> TypeID: ... class FunctionType(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( inputs: List[Type], results: List[Type], context: Optional[Context] = None @@ -1650,7 +1671,7 @@ class FunctionType(Type): def typeid(self) -> TypeID: ... class IndexType(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> IndexType: """ @@ -1766,7 +1787,7 @@ class InsertionPoint: """ class IntegerAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(type: Type, value: int) -> IntegerAttr: """ @@ -1855,7 +1876,7 @@ class IntegerSetConstraintList: def __len__(self) -> int: ... 
class IntegerType(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get_signed(width: int, context: Optional[Context] = None) -> IntegerType: """ @@ -1967,7 +1988,7 @@ class Location: """ class MemRefType(ShapedType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( shape: List[int], @@ -2007,7 +2028,7 @@ class Module: Creates an empty module """ @staticmethod - def parse(asm: str, context: Optional[Context] = None) -> Module: + def parse(asm: str | bytes, context: Optional[Context] = None) -> Module: """ Parses a module's assembly format from a string. @@ -2064,7 +2085,7 @@ class NamedAttribute: """ class NoneType(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> NoneType: """ @@ -2130,7 +2151,12 @@ class OpResultList: class OpSuccessors: def __add__(self, arg0: OpSuccessors) -> List[Block]: ... + @overload + def __getitem__(self, arg0: int) -> Block: ... + @overload + def __getitem__(self, arg0: slice) -> OpSuccessors: ... def __setitem__(self, arg0: int, arg1: Block) -> None: ... + def __len__(self) -> int: ... class OpView(_OperationBase): _ODS_OPERAND_SEGMENTS: ClassVar[None] = ... @@ -2154,7 +2180,7 @@ class OpView(_OperationBase): @classmethod def parse( cls: _Type[_TOperation], - source: str, + source: str | bytes, *, source_name: str = "", context: Optional[Context] = None, @@ -2174,7 +2200,7 @@ class OpView(_OperationBase): """ class OpaqueAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( dialect_namespace: str, @@ -2204,7 +2230,7 @@ class OpaqueAttr(Attribute): def typeid(self) -> TypeID: ... 
class OpaqueType(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( dialect_namespace: str, buffer: str, context: Optional[Context] = None @@ -2262,7 +2288,7 @@ class Operation(_OperationBase): """ @staticmethod def parse( - source: str, *, source_name: str = "", context: Optional[Context] = None + source: str | bytes, *, source_name: str = "", context: Optional[Context] = None ) -> Operation: """ Parses an operation. Supports both text assembly format and binary bytecode format. @@ -2290,7 +2316,7 @@ class OperationList: def __len__(self) -> int: ... class RankedTensorType(ShapedType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( shape: List[int], @@ -2443,7 +2469,7 @@ class ShapedTypeComponents: """ class StridedLayoutAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( offset: int, strides: List[int], context: Optional[Context] = None @@ -2477,9 +2503,9 @@ class StridedLayoutAttr(Attribute): def typeid(self) -> TypeID: ... class StringAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod - def get(value: str, context: Optional[Context] = None) -> StringAttr: + def get(value: str | bytes, context: Optional[Context] = None) -> StringAttr: """ Gets a uniqued string attribute """ @@ -2554,9 +2580,9 @@ class SymbolTable: def insert(self, operation: _OperationBase) -> Attribute: ... class TupleType(Type): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod - def get_Tuple(elements: List[Type], context: Optional[Context] = None) -> TupleType: + def get_tuple(elements: List[Type], context: Optional[Context] = None) -> TupleType: """ Create a Tuple type """ @@ -2576,7 +2602,7 @@ class TupleType(Type): def typeid(self) -> TypeID: ... 
class TypeAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(value: Type, context: Optional[Context] = None) -> TypeAttr: """ @@ -2603,7 +2629,7 @@ class TypeID: def _CAPIPtr(self) -> object: ... class UnitAttr(Attribute): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(context: Optional[Context] = None) -> UnitAttr: """ @@ -2618,7 +2644,7 @@ class UnitAttr(Attribute): def typeid(self) -> TypeID: ... class UnrankedMemRefType(ShapedType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( element_type: Type, memory_space: Attribute, loc: Optional[Location] = None @@ -2638,7 +2664,7 @@ class UnrankedMemRefType(ShapedType): def typeid(self) -> TypeID: ... class UnrankedTensorType(ShapedType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get(element_type: Type, loc: Optional[Location] = None) -> UnrankedTensorType: """ @@ -2651,7 +2677,7 @@ class UnrankedTensorType(ShapedType): def typeid(self) -> TypeID: ... 
class VectorType(ShapedType): - static_typeid: ClassVar[TypeID] # value = + static_typeid: ClassVar[TypeID] @staticmethod def get( shape: List[int], From 37e9bf9d151ec088685eb7f08c5124981eb5a82d Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 11 Jun 2024 15:48:29 +0200 Subject: [PATCH 74/82] [bazel] Add missing dependency for 3cc2710e0dd53bb82742904fa13014018a1137ed --- utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index fdf89d00cbb1d1..448d2ed9201acf 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -438,6 +438,7 @@ cc_library( srcs = ["lib/Dialect/Test/TestToLLVMIRTranslation.cpp"], deps = [ ":TestDialect", + "//llvm:Core", "//llvm:Support", "//mlir:BuiltinToLLVMIRTranslation", "//mlir:IR", From 2eb60e2de812187839954d2ce35e1479817631db Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Tue, 11 Jun 2024 17:05:04 +0300 Subject: [PATCH 75/82] [Offload][NFCI] Initialize the KernelArgsTy to default values (#95117) Co-authored-by: Joseph Huber --- offload/include/Shared/APITypes.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index e8fc27785b6c2e..a84b685eeedcec 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -89,22 +89,26 @@ struct __tgt_async_info { /// This struct contains all of the arguments to a target kernel region launch. struct KernelArgsTy { - uint32_t Version; // Version of this struct for ABI compatibility. - uint32_t NumArgs; // Number of arguments in each input pointer. - void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct). - void **ArgPtrs; // Pointer to the argument data. 
- int64_t *ArgSizes; // Size of the argument data in bytes. - int64_t *ArgTypes; // Type of the data (e.g. to / from). - void **ArgNames; // Name of the data for debugging, possibly null. - void **ArgMappers; // User-defined mappers, possibly null. - uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise. + uint32_t Version = 0; // Version of this struct for ABI compatibility. + uint32_t NumArgs = 0; // Number of arguments in each input pointer. + void **ArgBasePtrs = + nullptr; // Base pointer of each argument (e.g. a struct). + void **ArgPtrs = nullptr; // Pointer to the argument data. + int64_t *ArgSizes = nullptr; // Size of the argument data in bytes. + int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from). + void **ArgNames = nullptr; // Name of the data for debugging, possibly null. + void **ArgMappers = nullptr; // User-defined mappers, possibly null. + uint64_t Tripcount = + 0; // Tripcount for the teams / distribute loop, 0 otherwise. struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. uint64_t Unused : 63; - } Flags; - uint32_t NumTeams[3]; // The number of teams (for x,y,z dimension). - uint32_t ThreadLimit[3]; // The number of threads (for x,y,z dimension). - uint32_t DynCGroupMem; // Amount of dynamic cgroup memory requested. + } Flags = {0, 0}; + // The number of teams (for x,y,z dimension). + uint32_t NumTeams[3] = {0, 0, 0}; + // The number of threads (for x,y,z dimension). + uint32_t ThreadLimit[3] = {0, 0, 0}; + uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested. 
}; static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t), "Invalid struct size"); From d83f37f039dbd96918feb94c441882f71d772e55 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 11 Jun 2024 15:14:21 +0100 Subject: [PATCH 76/82] Revert#2 "[MLIR][Flang][DebugInfo] Set debug info format in MLIR->IR translation (#95098)" Also reverts "[MLIR][Flang][DebugInfo] Convert debug format in MLIR translators" The patch above introduces behaviour controlled by an LLVM flag into the Flang driver, which is incorrect behaviour. This reverts commits: 3cc2710e0dd53bb82742904fa13014018a1137ed. 460408f78b30720950040e336f7b566aa7203269. --- flang/lib/Frontend/FrontendActions.cpp | 10 ---------- mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp | 10 ---------- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 10 ---------- mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp | 10 ---------- 4 files changed, 40 deletions(-) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index a74d2be9be3b1c..b1b6391f1439c6 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -50,7 +50,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Bitcode/BitcodeWriterPass.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" -#include "llvm/IR/DebugProgramInstruction.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" @@ -82,8 +81,6 @@ using namespace Fortran::frontend; llvm::PassPluginLibraryInfo get##Ext##PluginInfo(); #include "llvm/Support/Extension.def" -extern llvm::cl::opt WriteNewDbgInfoFormat; - /// Save the given \c mlirModule to a temporary .mlir file, in a location /// decided by the -save-temps flag. No files are produced if the flag is not /// specified. @@ -1274,13 +1271,6 @@ void CodeGenAction::executeAction() { runOptimizationPipeline(ci.isOutputStreamNull() ? 
*os : ci.getOutputStream()); if (action == BackendActionTy::Backend_EmitLL) { - // When printing LLVM IR, we should convert the module to the debug info - // format that LLVM expects us to print. - // See https://llvm.org/docs/RemoveDIsDebugInfo.html - llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, - WriteNewDbgInfoFormat); - if (WriteNewDbgInfoFormat) - llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(ci.isOutputStreamNull() ? *os : ci.getOutputStream(), /*AssemblyAnnotationWriter=*/nullptr); return; diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp index be3b36c7620559..45588937795348 100644 --- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp @@ -16,12 +16,9 @@ #include "mlir/Target/LLVMIR/Dialect/All.h" #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Tools/mlir-translate/Translation.h" -#include "llvm/IR/DebugProgramInstruction.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -extern llvm::cl::opt WriteNewDbgInfoFormat; - using namespace mlir; namespace mlir { @@ -34,13 +31,6 @@ void registerToLLVMIRTranslation() { if (!llvmModule) return failure(); - // When printing LLVM IR, we should convert the module to the debug info - // format that LLVM expects us to print. 
- // See https://llvm.org/docs/RemoveDIsDebugInfo.html - llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, - WriteNewDbgInfoFormat); - if (WriteNewDbgInfoFormat) - llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(output, nullptr); return success(); }, diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index e1a60f195fe89c..7b86b250c294b4 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -64,8 +64,6 @@ using namespace mlir; using namespace mlir::LLVM; using namespace mlir::LLVM::detail; -extern llvm::cl::opt UseNewDbgInfoFormat; - #include "mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc" namespace { @@ -1791,9 +1789,6 @@ prepareLLVMModule(Operation *m, llvm::LLVMContext &llvmContext, StringRef name) { m->getContext()->getOrLoadDialect(); auto llvmModule = std::make_unique(name, llvmContext); - // ModuleTranslation can currently only construct modules in the old debug - // info format, so set the flag accordingly. - llvmModule->setNewDbgInfoFormatFlag(false); if (auto dataLayoutAttr = m->getDiscardableAttr(LLVM::LLVMDialect::getDataLayoutAttrName())) { llvmModule->setDataLayout(cast(dataLayoutAttr).getValue()); @@ -1872,11 +1867,6 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, if (failed(translator.convertFunctions())) return nullptr; - // Once we've finished constructing elements in the module, we should convert - // it to use the debug info format desired by LLVM. 
- // See https://llvm.org/docs/RemoveDIsDebugInfo.html - translator.llvmModule->setIsNewDbgInfoFormat(UseNewDbgInfoFormat); - if (!disableVerification && llvm::verifyModule(*translator.llvmModule, &llvm::errs())) return nullptr; diff --git a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp index 813b4960faa94d..57e7d658fb501f 100644 --- a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp +++ b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp @@ -22,9 +22,6 @@ #include "mlir/Tools/mlir-translate/Translation.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/TypeSwitch.h" -#include "llvm/IR/DebugProgramInstruction.h" - -extern llvm::cl::opt WriteNewDbgInfoFormat; using namespace mlir; @@ -125,13 +122,6 @@ void registerTestToLLVMIR() { if (!llvmModule) return failure(); - // When printing LLVM IR, we should convert the module to the debug info - // format that LLVM expects us to print. - // See https://llvm.org/docs/RemoveDIsDebugInfo.html - llvm::ScopedDbgInfoFormatSetter FormatSetter(*llvmModule, - WriteNewDbgInfoFormat); - if (WriteNewDbgInfoFormat) - llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(output, nullptr); return success(); }, From c0e1ad779f8b7b0073b89ecdd44c3b9c4a72e494 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Tue, 11 Jun 2024 16:17:25 +0200 Subject: [PATCH 77/82] [lldb] Skip declaration DIEs in the debug_names index (#94744) This makes sure we try to process declaration DIEs that are erroneously present in the index. Until bd5c6367bd7, clang was emitting index entries for declaration DIEs with DW_AT_signature attributes. This makes sure to avoid returning those DIEs as the definitions of a type, but also makes sure to pass through DIEs referring to static constexpr member variables, which is a (probably nonconforming) extension used by dsymutil. It adds test cases for both of the scenarios. It is essentially a recommit of #91808. 
--- .../SymbolFile/DWARF/DebugNamesDWARFIndex.cpp | 5 + .../DWARF/x86/debug-names-signature.s | 265 ++++++++++++++++++ .../x86/debug-names-static-constexpr-member.s | 169 +++++++++++ 3 files changed, 439 insertions(+) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-signature.s create mode 100644 lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-static-constexpr-member.s diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp index 90e42be7202d87..1d17f20670eed4 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -85,6 +85,11 @@ bool DebugNamesDWARFIndex::ProcessEntry( DWARFDIE die = GetDIE(entry); if (!die) return true; + // Clang used to erroneously emit index entries for declaration DIEs in case + // when the definition is in a type unit (llvm.org/pr77696). + if (die.IsStructUnionOrClass() && + die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0)) + return true; return callback(die); } diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-signature.s b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-signature.s new file mode 100644 index 00000000000000..7b845a72bbed46 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-signature.s @@ -0,0 +1,265 @@ +## Test that we can correctly complete types even if the debug_names index +## contains entries referring to declaration dies (clang emitted entries like +## that until bd5c6367bd7). +## +## This test consists of two compile units and one type unit. CU1 has the +## definition of a variable, but only a forward-declaration of its type. When +## attempting to find a definition, the debug_names lookup will return the DIE +## in CU0, which is also a forward-declaration (with a reference to a type +## unit). LLDB needs to find the definition of the type within the type unit. 
+ +# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t +# RUN: %lldb %t -o "target variable s" -o exit | FileCheck %s + +# CHECK: (lldb) target variable s +# CHECK-NEXT: (Struct) s = (member = 47) + + .data + .p2align 2, 0x0 + .long 0 +s: + .long 47 # 0x2f + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 0 # DW_CHILDREN_no + .byte 60 # DW_AT_declaration 
+ .byte 25 # DW_FORM_flag_present + .byte 105 # DW_AT_signature + .byte 32 # DW_FORM_ref_sig8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Ltu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 2 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad 4878254330033667422 # Type Signature + .long .LStruct_def-.Ltu_begin0 # Type DIE Offset + .byte 1 # Abbrev [1] 0x18:0x20 DW_TAG_type_unit + .short 33 # DW_AT_language +.LStruct_def: + .byte 2 # Abbrev [2] 0x23:0x10 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .long .Linfo_string6 # DW_AT_name + .byte 4 # DW_AT_byte_size + .byte 3 # Abbrev [3] 0x29:0x9 DW_TAG_member + .long .Linfo_string4 # DW_AT_name + .long .Lint-.Ltu_begin0 # DW_AT_type + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark +.Lint: + .byte 4 # Abbrev [4] 0x33:0x4 DW_TAG_base_type + .long .Linfo_string5 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + +.Lcu_begin0: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 5 # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 33 # DW_AT_language +.Ls: + .byte 6 # Abbrev [6] 0x1e:0xb DW_TAG_variable + .long .Linfo_string3 # DW_AT_name + .long .LStruct_decl2-.Lcu_begin0 # DW_AT_type + .byte 9 # DW_AT_location + .byte 3 + .quad s +.LStruct_decl2: + .byte 8 # Abbrev [8] 0x29:0x9 DW_TAG_structure_type + .long .Linfo_string6 # DW_AT_name + # DW_AT_declaration + .byte 0 # End Of Children Mark +.Ldebug_info_end1: + +.Lcu_begin1: + .long .Ldebug_info_end2-.Ldebug_info_start2 # Length of Unit +.Ldebug_info_start2: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 5 # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 33 # DW_AT_language +.LStruct_decl: + .byte 7 # Abbrev [7] 0x29:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad 4878254330033667422 # DW_AT_signature + .byte 0 # End Of Children Mark +.Ldebug_info_end2: + + .section .debug_str,"MS",@progbits,1 +.Linfo_string3: + .asciz "s" # string offset=60 +.Linfo_string4: + .asciz "member" # string offset=62 +.Linfo_string5: + .asciz "int" # string offset=69 +.Linfo_string6: + .asciz "Struct" # string offset=73 + + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 2 # Header: compilation unit count + .long 1 # Header: local type unit count + .long 0 # Header: foreign type unit count + .long 0 # Header: bucket count + .long 3 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long .Lcu_begin1 # Compilation unit 1 + .long .Ltu_begin0 # Type unit 0 + .long 
.Linfo_string6 # String in Bucket 0: Struct + .long .Linfo_string3 # String in Bucket 1: s + .long .Linfo_string5 # String in Bucket 2: int + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 0 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 2 +.Lnames_abbrev_start0: + .byte 1 # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 2 # Abbrev code + .byte 52 # DW_TAG_variable + .byte 1 # DW_IDX_compile_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 3 # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 4 # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_IDX_compile_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames1: + .byte 4 # Abbreviation code + .byte 1 # DW_IDX_compile_unit + .long .LStruct_decl-.Lcu_begin1 # DW_IDX_die_offset + .byte 1 # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long .LStruct_def-.Ltu_begin0 # DW_IDX_die_offset + .byte 0 + # End of list: Struct +.Lnames2: + .byte 2 # Abbreviation code + .byte 0 # DW_IDX_compile_unit + .long .Ls-.Lcu_begin0 # DW_IDX_die_offset + .byte 0 + # End of list: s +.Lnames0: + .byte 3 # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long .Lint-.Ltu_begin0 # DW_IDX_die_offset + .byte 0 + # End of list: int + .p2align 2, 0x0 +.Lnames_end0: diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-static-constexpr-member.s 
b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-static-constexpr-member.s new file mode 100644 index 00000000000000..9cb534207c3d13 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-static-constexpr-member.s @@ -0,0 +1,169 @@ +## Check that lldb can locate a static constant variable when its declaration is +## referenced by a debug_names index. This is a non-conforming extension used by +## dsymutil. + +# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t +# RUN: %lldb %t -o "target variable Class::constant" \ +# RUN: -o "expr -l c++ -- Class::constant" -o exit | FileCheck %s + +# CHECK: (lldb) target variable Class::constant +# CHECK-NEXT: (const int) Class::constant = 47 +# CHECK: (lldb) expr -l c++ -- Class::constant +# CHECK-NEXT: (const int) $0 = 47 + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 2 # DW_TAG_class_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 28 # DW_AT_const_value + .byte 13 # DW_FORM_sdata + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 38 # DW_TAG_const_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 36 # 
DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 1 # Abbrev [1] 0xc:0x40 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 33 # DW_AT_language +.LClass: + .byte 3 # Abbrev [3] 0x29:0x10 DW_TAG_class_type + .byte 5 # DW_AT_calling_convention + .long .Linfo_string4 # DW_AT_name + .byte 1 # DW_AT_byte_size +.Lvariable: + .byte 4 # Abbrev [4] 0x2f:0x9 DW_TAG_variable + .long .Linfo_string5 # DW_AT_name + .long .Lconst_int-.Lcu_begin0 # DW_AT_type + # DW_AT_external + # DW_AT_declaration + .byte 47 # DW_AT_const_value + .byte 0 # End Of Children Mark +.Lconst_int: + .byte 5 # Abbrev [5] 0x39:0x5 DW_TAG_const_type + .long .Lint-.Lcu_begin0 # DW_AT_type +.Lint: + .byte 6 # Abbrev [6] 0x3e:0x4 DW_TAG_base_type + .long .Linfo_string6 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + + .section .debug_str,"MS",@progbits,1 +.Linfo_string4: + .asciz "Class" +.Linfo_string5: + .asciz "constant" +.Linfo_string6: + .asciz "int" + + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 0 # Header: local type unit count + .long 0 # Header: foreign type unit count + .long 0 # Header: bucket count + .long 3 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: 
augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long .Linfo_string4 # String: Class + .long .Linfo_string5 # String: constant + .long .Linfo_string6 # String: int + .long .Lnames0-.Lnames_entries0 + .long .Lnames3-.Lnames_entries0 + .long .Lnames1-.Lnames_entries0 +.Lnames_abbrev_start0: + .byte 1 # Abbrev code + .byte 2 # DW_TAG_class_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 2 # Abbrev code + .byte 52 # DW_TAG_variable + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 3 # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames0: + .byte 1 # Abbreviation code + .long .LClass-.Lcu_begin0 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Class +.Lnames3: + .byte 2 # Abbreviation code + .long .Lvariable-.Lcu_begin0 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: constant +.Lnames1: + .byte 3 # Abbreviation code + .long .Lint-.Lcu_begin0 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: int +.Lnames_end0: From 1df37980c296ab33e96038c1daa06d580ae8b925 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 11 Jun 2024 14:56:33 +0100 Subject: [PATCH 78/82] [X86] early-ifcvt-remarks.ll - use i64 arithmetic to ensure ifcvt doesn't drop below threshold Upcoming SimplifyDemandedBits support for CMOV will simplify the code and reduce the critical path below the threshold if we stick with i32 multiplies --- llvm/test/CodeGen/X86/early-ifcvt-remarks.ll | 36 ++++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll b/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll index 
85c9aadbf70481..054485a358066e 100644 --- a/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll +++ b/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll @@ -68,24 +68,24 @@ if.else: ret i32 %res } -define i32 @mm3(i1 %pred, i32 %val, i32 %e1, i128 %e2, i128 %e3, i128 %e4, i128 %e5) { +define i64 @mm3(i1 %pred, i64 %val, i64 %e1, i128 %e2, i128 %e3, i128 %e4, i128 %e5) { ; CHECK-LABEL: mm3: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: movl %esi, %r10d +; CHECK-NEXT: movq %rsi, %r10 ; CHECK-NEXT: jne .LBB2_2 ; CHECK-NEXT: # %bb.1: # %if.false ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: imull %edx, %edx -; CHECK-NEXT: movslq %edx, %r10 -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: movl %edx, %r9d -; CHECK-NEXT: mulq %r10 -; CHECK-NEXT: imulq %r10, %r8 +; CHECK-NEXT: imulq %rdx, %rdx +; CHECK-NEXT: movq %rdx, %r10 ; CHECK-NEXT: sarq $63, %r10 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: movq %rdx, %r9 +; CHECK-NEXT: mulq %rdx ; CHECK-NEXT: imulq %rcx, %r10 +; CHECK-NEXT: imulq %r9, %r8 ; CHECK-NEXT: addq %rdx, %r8 ; CHECK-NEXT: addq %r10, %r8 ; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rax @@ -106,9 +106,9 @@ define i32 @mm3(i1 %pred, i32 %val, i32 %e1, i128 %e2, i128 %e3, i128 %e4, i128 ; CHECK-NEXT: orq %rdi, %r10 ; CHECK-NEXT: testb $64, %sil ; CHECK-NEXT: cmovneq %rax, %r10 -; CHECK-NEXT: movl %r9d, %eax +; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: .LBB2_2: # %if.endif -; CHECK-NEXT: addl %r10d, %eax +; CHECK-NEXT: addq %r10, %rax ; CHECK-NEXT: retq entry: br i1 %pred, label %if.true, label %if.false @@ -117,19 +117,19 @@ if.true: br label %if.endif if.false: - %f1 = mul i32 %e1, %e1 - %f3 = sext i32 %f1 to i128 + %f1 = mul i64 %e1, %e1 + %f3 = sext i64 %f1 to i128 %f4 = mul i128 %e2, %f3 %f6 = add i128 %e3, %f4 %f7 = xor i128 %e4, %f6 %f8 = xor i128 %e5, %f7 %a1 = ashr i128 %f8, %e5 - %f5 = trunc i128 %a1 to i32 + %f5 = trunc i128 %a1 to i64 
br label %if.endif if.endif: - %r1 = phi i32 [ %val, %if.true ], [ %f1, %if.false ] - %r2 = phi i32 [ %val, %if.true ], [ %f5, %if.false ] - %res = add i32 %r1, %r2 - ret i32 %res + %r1 = phi i64 [ %val, %if.true ], [ %f1, %if.false ] + %r2 = phi i64 [ %val, %if.true ], [ %f5, %if.false ] + %res = add i64 %r1, %r2 + ret i64 %res } From 464eb648fa33c46f51f7c3d1f4ab3eef7ec12750 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 11 Jun 2024 15:16:20 +0100 Subject: [PATCH 79/82] [X86] SimplifyDemandedBitsForTargetNode - add basic X86ISD::CMOV handling Add basic pass through handling - we could extend this to truncate CMOVQ to CMOVL in a future patch --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++++ .../CodeGen/X86/widen-load-of-small-alloca.ll | 54 +++++++++---------- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2ed79385272fa8..cad3ea4716db3e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42793,6 +42793,19 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1, AssumeSingleUse); } + case X86ISD::CMOV: { + KnownBits Known2; + if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits, + OriginalDemandedElts, Known2, TLO, Depth + 1)) + return true; + if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits, + OriginalDemandedElts, Known, TLO, Depth + 1)) + return true; + + // Only known if known in both the LHS and RHS. 
+ Known = Known.intersectWith(Known2); + break; + } case X86ISD::BEXTR: case X86ISD::BEXTRI: { SDValue Op0 = Op.getOperand(0); diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 929671d674e5e3..4a47e7613dfa6d 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -543,10 +543,10 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil @@ -573,19 +573,19 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: ; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %al, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: @@ -651,10 +651,10 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil @@ -681,19 +681,19 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: ; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %ax, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: @@ -758,10 +758,10 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leal (%rax,%rax), %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orl %edi, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NO-SHLD-NEXT: testb $64, %sil @@ -788,19 +788,19 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: ; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (%rcx,%rcx), %r8d ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: testb $64, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rdi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: cmoveq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: From be6248a42a39bb21e54764c48ddc3804b9b93ae5 Mon Sep 17 00:00:00 2001 From: Dmitry Chernenkov Date: Tue, 11 Jun 2024 14:26:18 +0000 Subject: [PATCH 80/82] [Bazel] Layering fix for 65310f34d7edf7924ca4cbe7df836770669f70dc --- utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel index fac692addb9e5a..9f8e217bbd08be 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel @@ -378,6 +378,7 @@ libc_support_library( name = "rint_test_template", hdrs = ["RIntTest.h"], deps = [ + "//libc:__support_cpp_algorithm", "//libc:__support_fputil_fenv_impl", "//libc:__support_fputil_fp_bits", "//libc:hdr_fenv_macros", @@ -416,6 +417,7 @@ libc_support_library( name = "round_to_integer_test_template", hdrs = ["RoundToIntegerTest.h"], deps = [ + "//libc:__support_cpp_algorithm", "//libc:__support_fputil_fenv_impl", "//libc:__support_fputil_fp_bits", "//libc:hdr_math_macros", From d58637219463924185614f18911c5f01a1c20aa9 Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Tue, 11 Jun 2024 07:31:06 -0700 Subject: [PATCH 81/82] [mlir] Add bufferization option for parallel region check (#94645) Handling parallel region RaW conflicts should usually be the responsibility of the source program, rather than bufferization analysis. 
However, to preserve current functionality, checks on parallel regions are put behind a bufferization option in this PR, which is on by default. Default functionality will not change, but this PR enables the option to leave parallelism checks out of the bufferization analysis. --- .../IR/BufferizableOpInterface.h | 5 +++++ .../TransformOps/BufferizationTransformOps.td | 1 + .../Bufferization/Transforms/Passes.td | 2 ++ .../Bufferization/Transforms/Bufferize.cpp | 1 + .../Transforms/OneShotAnalysis.cpp | 2 +- .../SCF/one-shot-bufferize-analysis.mlir | 21 ++++++++++++------- 6 files changed, 24 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index 2d8add82383bef..2fda091e412aef 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -309,6 +309,11 @@ struct BufferizationOptions { /// bufferized or not. bool bufferizeFunctionBoundaries = false; + // Specifies whether to account for parallel regions in RaW analysis. If true, + // then writes inside of parallel regions that write to buffers defined + // outside of the parallel region will be given a new buffer. + bool checkParallelRegions = true; + /// Certain ops have aliasing OpOperand/OpResult invariants (e.g., scf.for). /// If this flag is set to `false`, those invariants are no longer enforced /// with buffer copies.
diff --git a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td index 5ace9c390e146e..53b3b0505b3992 100644 --- a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td +++ b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td @@ -88,6 +88,7 @@ def OneShotBufferizeOp DefaultValuedAttr:$dump_alias_sets, DefaultValuedAttr:$test_analysis_only, DefaultValuedAttr:$print_conflicts, + DefaultValuedAttr:$check_parallel_regions, DefaultValuedAttr:$memcpy_op); let results = (outs TransformHandleTypeInterface:$transformed); diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index 8f8826b9ad56b4..1cece818dbbbc3 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -498,6 +498,8 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> { Option<"bufferizeFunctionBoundaries", "bufferize-function-boundaries", "bool", /*default=*/"0", "Bufferize function boundaries (experimental).">, + Option<"checkParallelRegions", "check-parallel-regions", "bool", + /*default=*/"true", "Account for parallel regions in RaW analysis.">, Option<"copyBeforeWrite", "copy-before-write", "bool", /*default=*/"false", "Skip the analysis. 
Make a buffer copy on every write.">, ListOption<"dialectFilter", "dialect-filter", "std::string", diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 0fddd60eb8140e..e422086c9fde6d 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -226,6 +226,7 @@ struct OneShotBufferizePass opt.printConflicts = printConflicts; opt.testAnalysisOnly = testAnalysisOnly; opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; + opt.checkParallelRegions = checkParallelRegions; opt.noAnalysisFuncFilter = noAnalysisFuncFilter; // Configure type converter. diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp index 2d329a1f3d8892..d0b4e0dd4383ea 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -611,7 +611,7 @@ hasReadAfterWriteInterference(const DenseSet &usesRead, // Before going through the main RaW analysis, find cases where a buffer must // be privatized due to parallelism. If the result of a write is never read, // privatization is not necessary (and large parts of the IR are likely dead). - if (!usesRead.empty()) { + if (options.checkParallelRegions && !usesRead.empty()) { for (OpOperand *uConflictingWrite : usesWrite) { // Find the allocation point or last write (definition) of the buffer. 
// Note: In contrast to `findDefinitions`, this also returns results of diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir index 4d82021e86f5bb..9bb87ffbb20905 100644 --- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s --check-prefixes=CHECK,PARALLEL-CHECK +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only check-parallel-regions=false" -split-input-file | FileCheck %s --check-prefixes=CHECK,NO-PARALLEL-CHECK // Run fuzzer with different seeds. // RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null @@ -811,8 +812,10 @@ func.func @parallel_region() -> tensor<320xf32> %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) { %val = "test.foo"() : () -> (f32) // linalg.fill must bufferize out-of-place because every thread needs a - // private copy of %alloc1. - // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // private copy of %alloc1. If not accounting for parallel regions, the fill + // can bufferize in place. 
+ // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32> scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]} @@ -841,8 +844,10 @@ func.func @parallel_region_mixed_def(%c: i1) -> tensor<320xf32> } %val = "test.foo"() : () -> (f32) // linalg.fill must bufferize out-of-place because every thread needs a - // private copy of %alloc1. - // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // private copy of %alloc1. If not accounting for parallel regions, the fill + // can bufferize in place. + // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} %fill = linalg.fill ins(%val : f32) outs(%selected : tensor<1xf32>) -> tensor<1xf32> scf.forall.in_parallel { // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]} @@ -866,8 +871,10 @@ func.func @parallel_region_two_writes(%f: f32) -> tensor<320xf32> %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) { %val = "test.foo"() : () -> (f32) // linalg.fill must bufferize out-of-place because every thread needs a - // private copy of %alloc1. - // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // private copy of %alloc1. If not accounting for parallel regions, the fill + // can bufferize in place. 
+ // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]} + // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32> // CHECK: tensor.insert // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"] From 38ccee00346300c87abc34860398bc950c65eaec Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com> Date: Tue, 11 Jun 2024 10:43:51 -0400 Subject: [PATCH 82/82] [WASM] Fix for wasi libc build break add tan to RuntimeLibcallSignatureTable (#95082) The wasm backend fetches the tan runtime lib call in `llvm/include/llvm/IR/RuntimeLibcalls.def` via `StaticLibcallNameMap()`, but ignores the runtime function because a function signature mapping is not specified in RuntimeLibcallSignatureTable(). The fix is to specify the function signatures for float32-128. This is a fix for a build break reported on PR https://github.com/llvm/llvm-project/pull/94559#issuecomment-2159923215.
--- .../WebAssemblyRuntimeLibcallSignatures.cpp | 3 + llvm/test/CodeGen/WebAssembly/libcalls.ll | 69 ++++++++++--------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index d9936557776ba1..20e50c8c9e1ae0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -201,6 +201,9 @@ struct RuntimeLibcallSignatureTable { Table[RTLIB::COS_F32] = f32_func_f32; Table[RTLIB::COS_F64] = f64_func_f64; Table[RTLIB::COS_F128] = i64_i64_func_i64_i64; + Table[RTLIB::TAN_F32] = f32_func_f32; + Table[RTLIB::TAN_F64] = f64_func_f64; + Table[RTLIB::TAN_F128] = i64_i64_func_i64_i64; Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR; Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR; Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR; diff --git a/llvm/test/CodeGen/WebAssembly/libcalls.ll b/llvm/test/CodeGen/WebAssembly/libcalls.ll index 4f57c347a1a335..70f000664d388a 100644 --- a/llvm/test/CodeGen/WebAssembly/libcalls.ll +++ b/llvm/test/CodeGen/WebAssembly/libcalls.ll @@ -12,6 +12,7 @@ declare fp128 @llvm.nearbyint.f128(fp128) declare fp128 @llvm.pow.f128(fp128, fp128) declare fp128 @llvm.powi.f128.i32(fp128, i32) +declare double @llvm.tan.f64(double) declare double @llvm.cos.f64(double) declare double @llvm.log10.f64(double) declare double @llvm.pow.f64(double, double) @@ -240,42 +241,44 @@ define double @f64libcalls(double %x, double %y, i32 %z) { ; CHECK: .functype f64libcalls (f64, f64, i32) -> (f64) ; CHECK-NEXT: .local i32 ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: global.get $push11=, __stack_pointer -; CHECK-NEXT: i32.const $push12=, 16 -; CHECK-NEXT: i32.sub $push18=, $pop11, $pop12 -; CHECK-NEXT: local.tee $push17=, 3, $pop18 -; CHECK-NEXT: global.set __stack_pointer, $pop17 -; CHECK-NEXT: local.get $push22=, 0 -; CHECK-NEXT: local.get 
$push19=, 0 -; CHECK-NEXT: call $push0=, cos, $pop19 -; CHECK-NEXT: call $push1=, log10, $pop0 -; CHECK-NEXT: local.get $push20=, 1 -; CHECK-NEXT: call $push2=, pow, $pop1, $pop20 -; CHECK-NEXT: local.get $push21=, 2 -; CHECK-NEXT: call $push3=, __powidf2, $pop2, $pop21 -; CHECK-NEXT: call $push4=, log, $pop3 -; CHECK-NEXT: call $push5=, exp, $pop4 -; CHECK-NEXT: call $push6=, exp10, $pop5 -; CHECK-NEXT: call $push7=, cbrt, $pop6 -; CHECK-NEXT: call $push8=, lround, $pop7 -; CHECK-NEXT: call $push9=, ldexp, $pop22, $pop8 -; CHECK-NEXT: local.get $push23=, 3 -; CHECK-NEXT: i32.const $push15=, 12 -; CHECK-NEXT: i32.add $push16=, $pop23, $pop15 -; CHECK-NEXT: call $push24=, frexp, $pop9, $pop16 -; CHECK-NEXT: local.set 0, $pop24 -; CHECK-NEXT: local.get $push25=, 3 -; CHECK-NEXT: i32.load $push10=, 12($pop25) -; CHECK-NEXT: call escape_value, $pop10 -; CHECK-NEXT: local.get $push26=, 3 +; CHECK-NEXT: global.get $push12=, __stack_pointer ; CHECK-NEXT: i32.const $push13=, 16 -; CHECK-NEXT: i32.add $push14=, $pop26, $pop13 -; CHECK-NEXT: global.set __stack_pointer, $pop14 -; CHECK-NEXT: local.get $push27=, 0 -; CHECK-NEXT: return $pop27 +; CHECK-NEXT: i32.sub $push19=, $pop12, $pop13 +; CHECK-NEXT: local.tee $push18=, 3, $pop19 +; CHECK-NEXT: global.set __stack_pointer, $pop18 +; CHECK-NEXT: local.get $push23=, 0 +; CHECK-NEXT: local.get $push20=, 0 +; CHECK-NEXT: call $push0=, tan, $pop20 +; CHECK-NEXT: call $push1=, cos, $pop0 +; CHECK-NEXT: call $push2=, log10, $pop1 +; CHECK-NEXT: local.get $push21=, 1 +; CHECK-NEXT: call $push3=, pow, $pop2, $pop21 +; CHECK-NEXT: local.get $push22=, 2 +; CHECK-NEXT: call $push4=, __powidf2, $pop3, $pop22 +; CHECK-NEXT: call $push5=, log, $pop4 +; CHECK-NEXT: call $push6=, exp, $pop5 +; CHECK-NEXT: call $push7=, exp10, $pop6 +; CHECK-NEXT: call $push8=, cbrt, $pop7 +; CHECK-NEXT: call $push9=, lround, $pop8 +; CHECK-NEXT: call $push10=, ldexp, $pop23, $pop9 +; CHECK-NEXT: local.get $push24=, 3 +; CHECK-NEXT: i32.const $push16=, 12 +; 
CHECK-NEXT: i32.add $push17=, $pop24, $pop16 +; CHECK-NEXT: call $push25=, frexp, $pop10, $pop17 +; CHECK-NEXT: local.set 0, $pop25 +; CHECK-NEXT: local.get $push26=, 3 +; CHECK-NEXT: i32.load $push11=, 12($pop26) +; CHECK-NEXT: call escape_value, $pop11 +; CHECK-NEXT: local.get $push27=, 3 +; CHECK-NEXT: i32.const $push14=, 16 +; CHECK-NEXT: i32.add $push15=, $pop27, $pop14 +; CHECK-NEXT: global.set __stack_pointer, $pop15 +; CHECK-NEXT: local.get $push28=, 0 +; CHECK-NEXT: return $pop28 - %a = call double @llvm.cos.f64(double %x) + %k = call double @llvm.tan.f64(double %x) + %a = call double @llvm.cos.f64(double %k) %b = call double @llvm.log10.f64(double %a) %c = call double @llvm.pow.f64(double %b, double %y) %d = call double @llvm.powi.f64.i32(double %c, i32 %z)