From db93ef14aef9c572e02bc842762bc4d0278148f9 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 18 Dec 2024 10:44:42 +0100 Subject: [PATCH 01/37] [Clang] Implement CWG2813: Class member access with prvalues (#120223) This is a rebase of #95112 with my own feedback apply as @MitalAshok has been inactive for a while. It's fairly important this makes clang 20 as it is a blocker for #107451 --- [CWG2813](https://cplusplus.github.io/CWG/issues/2813.html) prvalue.member_fn(expression-list) now will not materialize a temporary for prvalue if member_fn is an explicit object member function, and prvalue will bind directly to the object parameter. The E1 in E1.static_member is now a discarded-value expression, so if E1 was a call to a [[nodiscard]] function, there will now be a warning. This also affects C++98 with [[gnu::warn_unused_result]] functions. This should not affect C where TemporaryMaterializationConversion is a no-op. Closes #100314 Fixes #100341 --------- Co-authored-by: Mital Ashok --- .../clangd/unittests/DumpASTTests.cpp | 41 +++++- clang/docs/ReleaseNotes.rst | 5 + clang/include/clang/Sema/Sema.h | 5 + clang/lib/AST/Expr.cpp | 3 + clang/lib/Sema/SemaExprMember.cpp | 67 ++++++++-- clang/lib/Sema/SemaOverload.cpp | 11 +- clang/lib/Sema/SemaStmt.cpp | 117 +++++++++++------- .../dcl.attr/dcl.attr.nodiscard/p2.cpp | 48 +++++++ clang/test/CXX/drs/cwg28xx.cpp | 20 ++- clang/test/CodeGenCXX/cxx2b-deducing-this.cpp | 1 - clang/test/SemaCXX/cxx2b-deducing-this.cpp | 20 +++ clang/test/SemaCXX/ms-property.cpp | 42 ++++++- clang/www/cxx_dr_status.html | 2 +- 13 files changed, 306 insertions(+), 76 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/DumpASTTests.cpp b/clang-tools-extra/clangd/unittests/DumpASTTests.cpp index 304682118c871d..cb2c17ad4ef0d9 100644 --- a/clang-tools-extra/clangd/unittests/DumpASTTests.cpp +++ b/clang-tools-extra/clangd/unittests/DumpASTTests.cpp @@ -49,7 +49,7 @@ declaration: Function - root )"}, {R"cpp( namespace root { -struct S { static const int x = 0; }; +struct S { static const int x = 0; ~S(); }; int y = S::x + root::S().x; } )cpp", @@ -60,10 +60,12 @@ declaration: Namespace - root type: Qualified - const type: Builtin - int expression: IntegerLiteral - 0 + declaration: CXXDestructor + type: Record - S + type: FunctionProto + type: Builtin - void declaration: CXXConstructor declaration: CXXConstructor - declaration: CXXConstructor - declaration: CXXDestructor declaration: Var - y type: Builtin - int expression: ExprWithCleanups @@ -74,7 +76,7 @@ declaration: Namespace - root type: Record - S expression: ImplicitCast - LValueToRValue expression: Member - x - expression: MaterializeTemporary - rvalue + expression: CXXBindTemporary expression: CXXTemporaryObject - S type: Elaborated specifier: Namespace - root:: @@ -82,6 +84,37 @@ declaration: Namespace - root )"}, {R"cpp( namespace root { +struct S { static const int x = 0; }; +int y = S::x + root::S().x; +} + )cpp", + R"( +declaration: Namespace - root + declaration: CXXRecord - S + declaration: Var - x + type: Qualified - const + type: Builtin - int + expression: IntegerLiteral - 0 + declaration: CXXConstructor + declaration: CXXConstructor + declaration: CXXConstructor + declaration: CXXDestructor + declaration: Var - y + type: Builtin - int + expression: BinaryOperator - + + expression: ImplicitCast - LValueToRValue + expression: DeclRef - x + specifier: TypeSpec + type: Record - S + expression: ImplicitCast - LValueToRValue + expression: Member - x + expression: CXXTemporaryObject - S + type: 
Elaborated + specifier: Namespace - root:: + type: Record - S + )"}, + {R"cpp( +namespace root { template int tmpl() { (void)tmpl(); return T::value; diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 408b2800f9e79c..956b5532b48f65 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -321,6 +321,11 @@ Resolutions to C++ Defect Reports - Fix name lookup for a dependent base class that is the current instantiation. (`CWG591: When a dependent base class is the current instantiation `_). +- Clang now allows calling explicit object member functions directly with prvalues + instead of always materializing a temporary, meaning by-value explicit object parameters + do not need to move from a temporary. + (`CWG2813: Class member access with prvalues `_). + C Language Changes ------------------ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index ae07ed8478f2a8..5ee7ea48cc983c 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10659,6 +10659,11 @@ class Sema final : public SemaBase { SourceLocation EndLoc); void ActOnForEachDeclStmt(DeclGroupPtrTy Decl); + /// DiagnoseDiscardedExprMarkedNodiscard - Given an expression that is + /// semantically a discarded-value expression, diagnose if any [[nodiscard]] + /// value has been discarded. + void DiagnoseDiscardedExprMarkedNodiscard(const Expr *E); + /// DiagnoseUnusedExprResult - If the statement passed in is an expression /// whose result is unused, warn. void DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID); diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 5a6738196d2890..8c8ccdb61dc01c 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -2990,6 +2990,9 @@ bool Expr::isUnusedResultAWarning(const Expr *&WarnE, SourceLocation &Loc, case ExprWithCleanupsClass: return cast(this)->getSubExpr() ->isUnusedResultAWarning(WarnE, Loc, R1, R2, Ctx); + case OpaqueValueExprClass: + return cast(this)->getSourceExpr()->isUnusedResultAWarning( + WarnE, Loc, R1, R2, Ctx); } } diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index 85d5dfcb3db6de..bcc1b92ffdec73 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -1003,15 +1003,6 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType, : !isDependentScopeSpecifier(SS) || computeDeclContext(SS)) && "dependent lookup context that isn't the current instantiation?"); - // C++1z [expr.ref]p2: - // For the first option (dot) the first expression shall be a glvalue [...] - if (!IsArrow && BaseExpr && BaseExpr->isPRValue()) { - ExprResult Converted = TemporaryMaterializationConversion(BaseExpr); - if (Converted.isInvalid()) - return ExprError(); - BaseExpr = Converted.get(); - } - const DeclarationNameInfo &MemberNameInfo = R.getLookupNameInfo(); DeclarationName MemberName = MemberNameInfo.getName(); SourceLocation MemberLoc = MemberNameInfo.getLoc(); @@ -1128,26 +1119,68 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType, BaseExpr = BuildCXXThisExpr(Loc, BaseExprType, /*IsImplicit=*/true); } + // C++17 [expr.ref]p2, per CWG2813: + // For the first option (dot), if the id-expression names a static member or + // an enumerator, the first expression is a discarded-value expression; if + // the id-expression names a non-static data member, the first expression + // shall be a glvalue. 
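+  //
+  // Illustrative sketch of the effect (not from the patch; assumes C++23 for
+  // the explicit object parameter):
+  //   struct S { static int sm; void f(this S self); };
+  //   [[nodiscard]] S get();
+  //   get().sm;  // base is now a discarded-value expression -> may warn
+  //   get().f(); // prvalue binds directly to 'self'; no temporary materialized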
+ auto ConvertBaseExprToDiscardedValue = [&] { + assert(getLangOpts().CPlusPlus && + "Static member / member enumerator outside of C++"); + if (IsArrow) + return false; + ExprResult Converted = IgnoredValueConversions(BaseExpr); + if (Converted.isInvalid()) + return true; + BaseExpr = Converted.get(); + DiagnoseDiscardedExprMarkedNodiscard(BaseExpr); + return false; + }; + auto ConvertBaseExprToGLValue = [&] { + if (IsArrow || !BaseExpr->isPRValue()) + return false; + ExprResult Converted = TemporaryMaterializationConversion(BaseExpr); + if (Converted.isInvalid()) + return true; + BaseExpr = Converted.get(); + return false; + }; + // Check the use of this member. if (DiagnoseUseOfDecl(MemberDecl, MemberLoc)) return ExprError(); - if (FieldDecl *FD = dyn_cast(MemberDecl)) + if (FieldDecl *FD = dyn_cast(MemberDecl)) { + if (ConvertBaseExprToGLValue()) + return ExprError(); return BuildFieldReferenceExpr(BaseExpr, IsArrow, OpLoc, SS, FD, FoundDecl, MemberNameInfo); + } - if (MSPropertyDecl *PD = dyn_cast(MemberDecl)) + if (MSPropertyDecl *PD = dyn_cast(MemberDecl)) { + // No temporaries are materialized for property references yet. + // They might be materialized when this is transformed into a member call. + // Note that this is slightly different behaviour from MSVC which doesn't + // implement CWG2813 yet: MSVC might materialize an extra temporary if the + // getter or setter function is an explicit object member function. return BuildMSPropertyRefExpr(*this, BaseExpr, IsArrow, SS, PD, MemberNameInfo); + } - if (IndirectFieldDecl *FD = dyn_cast(MemberDecl)) + if (IndirectFieldDecl *FD = dyn_cast(MemberDecl)) { + if (ConvertBaseExprToGLValue()) + return ExprError(); // We may have found a field within an anonymous union or struct // (C++ [class.union]). 
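    // e.g. (sketch): given 'struct A { union { int n; }; };', the access
    // 'A().n' reaches this path; the prvalue base was just converted to a
    // glvalue by ConvertBaseExprToGLValue() above.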
return BuildAnonymousStructUnionMemberReference(SS, MemberLoc, FD, FoundDecl, BaseExpr, OpLoc); + } + // Static data member if (VarDecl *Var = dyn_cast(MemberDecl)) { + if (ConvertBaseExprToDiscardedValue()) + return ExprError(); return BuildMemberExpr(BaseExpr, IsArrow, OpLoc, SS.getWithLocInContext(Context), TemplateKWLoc, Var, FoundDecl, /*HadMultipleCandidates=*/false, @@ -1161,7 +1194,13 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType, if (MemberFn->isInstance()) { valueKind = VK_PRValue; type = Context.BoundMemberTy; + if (MemberFn->isImplicitObjectMemberFunction() && + ConvertBaseExprToGLValue()) + return ExprError(); } else { + // Static member function + if (ConvertBaseExprToDiscardedValue()) + return ExprError(); valueKind = VK_LValue; type = MemberFn->getType(); } @@ -1174,6 +1213,8 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType, assert(!isa(MemberDecl) && "member function not C++ method?"); if (EnumConstantDecl *Enum = dyn_cast(MemberDecl)) { + if (ConvertBaseExprToDiscardedValue()) + return ExprError(); return BuildMemberExpr( BaseExpr, IsArrow, OpLoc, SS.getWithLocInContext(Context), TemplateKWLoc, Enum, FoundDecl, /*HadMultipleCandidates=*/false, @@ -1181,6 +1222,8 @@ Sema::BuildMemberReferenceExpr(Expr *BaseExpr, QualType BaseExprType, } if (VarTemplateDecl *VarTempl = dyn_cast(MemberDecl)) { + if (ConvertBaseExprToDiscardedValue()) + return ExprError(); if (!TemplateArgs) { diagnoseMissingTemplateArguments( SS, /*TemplateKeyword=*/TemplateKWLoc.isValid(), VarTempl, MemberLoc); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 3dabe362802c90..fff49b759c935e 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -5933,7 +5933,9 @@ ExprResult Sema::PerformImplicitObjectArgumentInitialization( DestType = ImplicitParamRecordType; FromClassification = From->Classify(Context); - // When performing member access on a prvalue, materialize a temporary. + // CWG2813 [expr.call]p6: + // If the function is an implicit object member function, the object + // expression of the class member access shall be a glvalue [...] if (From->isPRValue()) { From = CreateMaterializeTemporaryExpr(FromRecordType, From, Method->getRefQualifier() != @@ -6464,11 +6466,6 @@ static Expr *GetExplicitObjectExpr(Sema &S, Expr *Obj, VK_LValue, OK_Ordinary, SourceLocation(), /*CanOverflow=*/false, FPOptionsOverride()); } - if (Obj->Classify(S.getASTContext()).isPRValue()) { - Obj = S.CreateMaterializeTemporaryExpr( - ObjType, Obj, - !Fun->getParamDecl(0)->getType()->isRValueReferenceType()); - } return Obj; } @@ -15584,8 +15581,6 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE, CurFPFeatureOverrides(), Proto->getNumParams()); } else { // Convert the object argument (for a non-static member function call). - // We only need to do this if there was actually an overload; otherwise - // it was done at lookup. 
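  // Post-CWG2813 sketch (illustrative, not from the patch): for an implicit
  // object member function such as 'struct Y { void g(); };', the call
  // 'Y().g()' still materializes the prvalue Y() into an xvalue here.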
ExprResult ObjectArg = PerformImplicitObjectArgumentInitialization( MemExpr->getBase(), Qualifier, FoundDecl, Method); if (ObjectArg.isInvalid()) diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 0e5c6cd49dccad..d9149f7ee40bbf 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -226,17 +226,18 @@ static bool DiagnoseNoDiscard(Sema &S, const NamedDecl *OffendingDecl, return S.Diag(Loc, diag::warn_unused_result) << A << true << Msg << R1 << R2; } -void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { - if (const LabelStmt *Label = dyn_cast_or_null(S)) - return DiagnoseUnusedExprResult(Label->getSubStmt(), DiagID); +namespace { - const Expr *E = dyn_cast_or_null(S); - if (!E) - return; +// Diagnoses unused expressions that call functions marked [[nodiscard]], +// [[gnu::warn_unused_result]] and similar. +// Additionally, a DiagID can be provided to emit a warning in additional +// contexts (such as for an unused LHS of a comma expression) +void DiagnoseUnused(Sema &S, const Expr *E, std::optional DiagID) { + bool NoDiscardOnly = !DiagID.has_value(); // If we are in an unevaluated expression context, then there can be no unused // results because the results aren't expected to be used in the first place. - if (isUnevaluatedContext()) + if (S.isUnevaluatedContext()) return; SourceLocation ExprLoc = E->IgnoreParenImpCasts()->getExprLoc(); @@ -245,30 +246,31 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { // expression is a call to a function with the warn_unused_result attribute, // we warn no matter the location. Because of the order in which the various // checks need to happen, we factor out the macro-related test here. - bool ShouldSuppress = - SourceMgr.isMacroBodyExpansion(ExprLoc) || - SourceMgr.isInSystemMacro(ExprLoc); + bool ShouldSuppress = S.SourceMgr.isMacroBodyExpansion(ExprLoc) || + S.SourceMgr.isInSystemMacro(ExprLoc); const Expr *WarnExpr; SourceLocation Loc; SourceRange R1, R2; - if (!E->isUnusedResultAWarning(WarnExpr, Loc, R1, R2, Context)) - return; - - // If this is a GNU statement expression expanded from a macro, it is probably - // unused because it is a function-like macro that can be used as either an - // expression or statement. Don't warn, because it is almost certainly a - // false positive. - if (isa(E) && Loc.isMacroID()) + if (!E->isUnusedResultAWarning(WarnExpr, Loc, R1, R2, S.Context)) return; - // Check if this is the UNREFERENCED_PARAMETER from the Microsoft headers. - // That macro is frequently used to suppress "unused parameter" warnings, - // but its implementation makes clang's -Wunused-value fire. Prevent this. - if (isa(E->IgnoreImpCasts()) && Loc.isMacroID()) { - SourceLocation SpellLoc = Loc; - if (findMacroSpelling(SpellLoc, "UNREFERENCED_PARAMETER")) + if (!NoDiscardOnly) { + // If this is a GNU statement expression expanded from a macro, it is + // probably unused because it is a function-like macro that can be used as + // either an expression or statement. Don't warn, because it is almost + // certainly a false positive. + if (isa(E) && Loc.isMacroID()) return; + + // Check if this is the UNREFERENCED_PARAMETER from the Microsoft headers. + // That macro is frequently used to suppress "unused parameter" warnings, + // but its implementation makes clang's -Wunused-value fire. Prevent this. 
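+    // (UNREFERENCED_PARAMETER(P) is conventionally defined as just '(P)',
+    // an expression whose value is unused.)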
+ if (isa(E->IgnoreImpCasts()) && Loc.isMacroID()) { + SourceLocation SpellLoc = Loc; + if (S.findMacroSpelling(SpellLoc, "UNREFERENCED_PARAMETER")) + return; + } } // Okay, we have an unused result. Depending on what the base expression is, @@ -279,7 +281,7 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { if (const CXXBindTemporaryExpr *TempExpr = dyn_cast(E)) E = TempExpr->getSubExpr(); - if (DiagnoseUnusedComparison(*this, E)) + if (DiagnoseUnusedComparison(S, E)) return; E = WarnExpr; @@ -293,8 +295,8 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { if (E->getType()->isVoidType()) return; - auto [OffendingDecl, A] = CE->getUnusedResultAttr(Context); - if (DiagnoseNoDiscard(*this, OffendingDecl, + auto [OffendingDecl, A] = CE->getUnusedResultAttr(S.Context); + if (DiagnoseNoDiscard(S, OffendingDecl, cast_or_null(A), Loc, R1, R2, /*isCtor=*/false)) return; @@ -307,11 +309,11 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { if (ShouldSuppress) return; if (FD->hasAttr()) { - Diag(Loc, diag::warn_unused_call) << R1 << R2 << "pure"; + S.Diag(Loc, diag::warn_unused_call) << R1 << R2 << "pure"; return; } if (FD->hasAttr()) { - Diag(Loc, diag::warn_unused_call) << R1 << R2 << "const"; + S.Diag(Loc, diag::warn_unused_call) << R1 << R2 << "const"; return; } } @@ -323,15 +325,15 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { OffendingDecl = Ctor->getParent(); A = OffendingDecl->getAttr(); } - if (DiagnoseNoDiscard(*this, OffendingDecl, A, Loc, R1, R2, + if (DiagnoseNoDiscard(S, OffendingDecl, A, Loc, R1, R2, /*isCtor=*/true)) return; } } else if (const auto *ILE = dyn_cast(E)) { if (const TagDecl *TD = ILE->getType()->getAsTagDecl()) { - if (DiagnoseNoDiscard(*this, TD, TD->getAttr(), Loc, - R1, R2, /*isCtor=*/false)) + if (DiagnoseNoDiscard(S, TD, TD->getAttr(), Loc, R1, + R2, /*isCtor=*/false)) return; } } else if (ShouldSuppress) @@ -339,23 +341,24 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { E = WarnExpr; if (const ObjCMessageExpr *ME = dyn_cast(E)) { - if (getLangOpts().ObjCAutoRefCount && ME->isDelegateInitCall()) { - Diag(Loc, diag::err_arc_unused_init_message) << R1; + if (S.getLangOpts().ObjCAutoRefCount && ME->isDelegateInitCall()) { + S.Diag(Loc, diag::err_arc_unused_init_message) << R1; return; } const ObjCMethodDecl *MD = ME->getMethodDecl(); if (MD) { - if (DiagnoseNoDiscard(*this, nullptr, MD->getAttr(), - Loc, R1, R2, /*isCtor=*/false)) + if (DiagnoseNoDiscard(S, nullptr, MD->getAttr(), + Loc, R1, R2, + /*isCtor=*/false)) return; } } else if (const PseudoObjectExpr *POE = dyn_cast(E)) { const Expr *Source = POE->getSyntacticForm(); // Handle the actually selected call of an OpenMP specialized call. - if (LangOpts.OpenMP && isa(Source) && + if (S.LangOpts.OpenMP && isa(Source) && POE->getNumSemanticExprs() == 1 && isa(POE->getSemanticExpr(0))) - return DiagnoseUnusedExprResult(POE->getSemanticExpr(0), DiagID); + return DiagnoseUnused(S, POE->getSemanticExpr(0), DiagID); if (isa(Source)) DiagID = diag::warn_unused_container_subscript_expr; else if (isa(Source)) @@ -372,17 +375,21 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { if (!RD->getAttr()) return; } + + if (NoDiscardOnly) + return; + // Diagnose "(void*) blah" as a typo for "(void) blah". 
- else if (const CStyleCastExpr *CE = dyn_cast(E)) { + if (const CStyleCastExpr *CE = dyn_cast(E)) { TypeSourceInfo *TI = CE->getTypeInfoAsWritten(); QualType T = TI->getType(); // We really do want to use the non-canonical type here. - if (T == Context.VoidPtrTy) { + if (T == S.Context.VoidPtrTy) { PointerTypeLoc TL = TI->getTypeLoc().castAs(); - Diag(Loc, diag::warn_unused_voidptr) - << FixItHint::CreateRemoval(TL.getStarLoc()); + S.Diag(Loc, diag::warn_unused_voidptr) + << FixItHint::CreateRemoval(TL.getStarLoc()); return; } } @@ -391,16 +398,34 @@ void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { // isn't an array. if (E->isGLValue() && E->getType().isVolatileQualified() && !E->getType()->isArrayType()) { - Diag(Loc, diag::warn_unused_volatile) << R1 << R2; + S.Diag(Loc, diag::warn_unused_volatile) << R1 << R2; return; } // Do not diagnose use of a comma operator in a SFINAE context because the // type of the left operand could be used for SFINAE, so technically it is // *used*. - if (DiagID != diag::warn_unused_comma_left_operand || !isSFINAEContext()) - DiagIfReachable(Loc, S ? llvm::ArrayRef(S) : llvm::ArrayRef(), - PDiag(DiagID) << R1 << R2); + if (DiagID == diag::warn_unused_comma_left_operand && S.isSFINAEContext()) + return; + + S.DiagIfReachable(Loc, llvm::ArrayRef(E), + S.PDiag(*DiagID) << R1 << R2); +} +} // namespace + +void Sema::DiagnoseDiscardedExprMarkedNodiscard(const Expr *E) { + DiagnoseUnused(*this, E, std::nullopt); +} + +void Sema::DiagnoseUnusedExprResult(const Stmt *S, unsigned DiagID) { + if (const LabelStmt *Label = dyn_cast_if_present(S)) + S = Label->getSubStmt(); + + const Expr *E = dyn_cast_if_present(S); + if (!E) + return; + + DiagnoseUnused(*this, E, DiagID); } void Sema::ActOnStartOfCompoundStmt(bool IsStmtExpr) { diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp index da1f8201f55dcc..18f4bd5e9c0fae 100644 --- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp +++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp @@ -1,6 +1,7 @@ // RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify=expected,cxx11,cxx11-17 -pedantic %s // RUN: %clang_cc1 -fsyntax-only -std=c++17 -verify=expected,cxx11-17,since-cxx17 -pedantic %s // RUN: %clang_cc1 -fsyntax-only -std=c++20 -verify=expected,since-cxx17 -pedantic %s +// RUN: %clang_cc1 -fsyntax-only -std=c++23 -verify=expected,since-cxx17 -pedantic %s struct [[nodiscard]] S {}; // cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} @@ -134,3 +135,50 @@ void usage() { static_cast(s); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute: Don't throw away as a double}} } } // namespace p1771 + +namespace discarded_member_access { +struct X { + union { + int variant_member; + }; + struct { // expected-warning {{anonymous structs are a GNU extension}} + int anonymous_struct_member; + }; + int data_member; + static int static_data_member; + enum { + unscoped_enum + }; + enum class scoped_enum_t { + scoped_enum + }; + using enum scoped_enum_t; + // cxx11-17-warning@-1 {{using enum declaration is a C++20 extension}} + + void implicit_object_member_function(); + static void static_member_function(); +#if __cplusplus >= 202302L + void explicit_object_member_function(this X self); +#endif +}; + +[[nodiscard]] X get_X(); +// cxx11-warning@-1 {{use of the 'nodiscard' attribute is a C++17 extension}} +void f() { + (void) get_X().variant_member; + (void) 
get_X().anonymous_struct_member; + (void) get_X().data_member; + (void) get_X().static_data_member; + // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}} + (void) get_X().unscoped_enum; + // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}} + (void) get_X().scoped_enum; + // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}} + (void) get_X().implicit_object_member_function(); + (void) get_X().static_member_function(); + // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}} +#if __cplusplus >= 202302L + (void) get_X().explicit_object_member_function(); +#endif +} +} // namespace discarded_member_access diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp index 9796607a790ce3..ff625a4a985bcc 100644 --- a/clang/test/CXX/drs/cwg28xx.cpp +++ b/clang/test/CXX/drs/cwg28xx.cpp @@ -30,7 +30,25 @@ using U2 = decltype(&main); #endif } // namespace cwg2811 -namespace cwg2819 { // cwg2819: 19 +namespace cwg2813 { // cwg2813: 20 +#if __cplusplus >= 202302L +struct X { + X() = default; + + X(const X&) = delete; + X& operator=(const X&) = delete; + + void f(this X self) { } +}; + +void f() { + X{}.f(); +} +#endif +} // namespace cwg2813 + +namespace cwg2819 { // cwg2819: 19 tentatively ready 2023-12-01 + #if __cpp_constexpr >= 202306L constexpr void* p = nullptr; constexpr int* q = static_cast(p); diff --git a/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp index 1c8835a3986ea0..8a78463d3a4955 100644 --- a/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp +++ b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp @@ -31,7 +31,6 @@ void test_lambda() { //CHECK: define dso_local void @{{.*}}test_lambda{{.*}}() #0 { //CHECK: entry: //CHECK: %agg.tmp = alloca %class.anon, align 1 -//CHECK: %ref.tmp = alloca %class.anon, align 1 //CHECK: %call = call noundef i32 @"_ZZ11test_lambdavENH3$_0clIS_EEiT_"() //CHECK: ret void //CHECK: } diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 520052a89d1840..6f17ce72754560 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -437,6 +437,10 @@ namespace std { constexpr strong_ordering strong_ordering::equal = {0}; constexpr strong_ordering strong_ordering::greater = {1}; constexpr strong_ordering strong_ordering::less = {-1}; + + template constexpr __remove_reference_t(T)&& move(T&& t) noexcept { + return static_cast<__remove_reference_t(T)&&>(t); + } } namespace operators_deduction { @@ -965,6 +969,22 @@ void f(); void a::f(this auto) {} // expected-error {{an explicit object parameter cannot appear in a non-member function}} } +namespace GH100341 { +struct X { + X() = default; + X(X&&) = default; + void operator()(this X); +}; + +void fail() { + X()(); + [x = X{}](this auto) {}(); +} +void pass() { + std::move(X())(); + std::move([x = X{}](this auto) {})(); +} +} // namespace GH100341 struct R { void f(this auto &&self, int &&r_value_ref) {} // expected-note {{candidate function template not viable: expects an rvalue for 2nd argument}} void g(int &&r_value_ref) { diff --git a/clang/test/SemaCXX/ms-property.cpp b/clang/test/SemaCXX/ms-property.cpp index 168987b2462233..d5799a8a4d3639 100644 --- a/clang/test/SemaCXX/ms-property.cpp +++ b/clang/test/SemaCXX/ms-property.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -ast-print -verify 
-triple=x86_64-pc-win32 -fms-compatibility %s -o - | FileCheck %s -// RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -emit-pch -o %t %s -// RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -include-pch %t -verify %s -ast-print -o - | FileCheck %s -// expected-no-diagnostics +// RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -emit-pch -o %t -verify %s +// RUN: %clang_cc1 -triple=x86_64-pc-win32 -fms-compatibility -include-pch %t %s -ast-print -o - | FileCheck %s +// RUN: %clang_cc1 -fdeclspec -fsyntax-only -verify %s -std=c++23 #ifndef HEADER #define HEADER @@ -85,4 +85,40 @@ int main(int argc, char **argv) { // CHECK-NEXT: return Test1::GetTest1()->X; return Test1::GetTest1()->X; } + +struct X { + int implicit_object_member_function() { return 0; } + static int static_member_function() { return 0; } + + __declspec(property(get=implicit_object_member_function)) int imp; + __declspec(property(get=static_member_function)) int st; + +#if __cplusplus >= 202302L + int explicit_object_member_function(this X self) { return 0; } + __declspec(property(get=explicit_object_member_function)) int exp; +#endif +}; + +[[nodiscard]] X get_x(); +void f() { + (void) get_x().imp; + (void) get_x().st; + // expected-warning@-1 {{ignoring return value of function declared with 'nodiscard' attribute}} +#if __cplusplus >= 202302L + (void) get_x().exp; +#endif +} + +#if __cplusplus >= 202302L +struct Y { + Y() = default; + Y(const Y&) = delete; + int explicit_object_member_function(this Y) { return 0; } + __declspec(property(get = explicit_object_member_function)) int prop; +}; +void g() { + (void) Y().prop; +} +#endif + #endif // HEADER diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index cdedbcbaa40722..386c57250b7db6 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -16726,7 +16726,7 @@

C++ defect report implementation status

2813 DRWP Class member access with prvalues - Unknown + Clang 20 2814 From 16c02df8caae7b03fef4bc56759c342e7ff42d8b Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 18 Dec 2024 09:45:04 +0000 Subject: [PATCH 02/37] [lldb] Add lldb/source/Host/posix/MainLoopPosix.cpp to git blame ignores --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 9ef0713ef8af14..30d9f6b883cebe 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -97,3 +97,6 @@ e80bc777749331e9519575f416c342f7626dd14d # NFC: clang-format test_demangle.pass.cpp but keep test "lines" d33bf2e9df578ff7e44fd22504d6ad5a122b7ee6 + +# [lldb][NFC] clang-format MainLoopPosix.cpp +66bdbfbaa08fa3d8e64a7fe136a8fb717f5cdbb7 From 1ee740a79620aa680f68d873d6a7b5cfa1df7b19 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 18 Dec 2024 09:46:45 +0000 Subject: [PATCH 03/37] [VFABI] Add support for vector functions that return struct types (#119000) This patch updates the `VFABIDemangler` to support vector functions that return struct types. For example, a vector variant of `sincos` that returns a vector of sine values and a vector of cosine values within a struct. This patch also adds some helpers for vectorizing types (including struct types). Some of these are used in the `VFABIDemangler`, and others will be used in subsequent patches, so this patch simply adds tests for them. --- llvm/include/llvm/Analysis/VectorUtils.h | 14 +- llvm/include/llvm/IR/VectorTypeUtils.h | 94 ++++++++++++++ llvm/lib/IR/CMakeLists.txt | 1 + llvm/lib/IR/VFABIDemangler.cpp | 21 ++- llvm/lib/IR/VectorTypeUtils.cpp | 54 ++++++++ llvm/unittests/IR/CMakeLists.txt | 1 + llvm/unittests/IR/VFABIDemanglerTest.cpp | 85 +++++++++++- llvm/unittests/IR/VectorTypeUtilsTest.cpp | 149 ++++++++++++++++++++++ 8 files changed, 399 insertions(+), 20 deletions(-) create mode 100644 llvm/include/llvm/IR/VectorTypeUtils.h create mode 100644 llvm/lib/IR/VectorTypeUtils.cpp create mode 100644 llvm/unittests/IR/VectorTypeUtilsTest.cpp diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index c1016dd7bdddbd..7f8a0c9c0af7be 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -18,6 +18,7 @@ #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/IR/Module.h" #include "llvm/IR/VFABIDemangler.h" +#include "llvm/IR/VectorTypeUtils.h" #include "llvm/Support/CheckedArithmetic.h" namespace llvm { @@ -127,19 +128,6 @@ namespace Intrinsic { typedef unsigned ID; } -/// A helper function for converting Scalar types to vector types. If -/// the incoming type is void, we return void. If the EC represents a -/// scalar, we return the scalar type. -inline Type *ToVectorTy(Type *Scalar, ElementCount EC) { - if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar()) - return Scalar; - return VectorType::get(Scalar, EC); -} - -inline Type *ToVectorTy(Type *Scalar, unsigned VF) { - return ToVectorTy(Scalar, ElementCount::getFixed(VF)); -} - /// Identify if the intrinsic is trivially vectorizable. 
/// This method returns true if the intrinsic's argument types are all scalars
/// for the scalar form of the intrinsic and all vectors (or scalars handled by
diff --git a/llvm/include/llvm/IR/VectorTypeUtils.h b/llvm/include/llvm/IR/VectorTypeUtils.h
new file mode 100644
index 00000000000000..f30bf9ee9240b0
--- /dev/null
+++ b/llvm/include/llvm/IR/VectorTypeUtils.h
@@ -0,0 +1,94 @@
+//===------- VectorTypeUtils.h - Vector type utility functions -*- C++ -*-====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_VECTORTYPEUTILS_H
+#define LLVM_IR_VECTORTYPEUTILS_H
+
+#include "llvm/IR/DerivedTypes.h"
+
+namespace llvm {
+
+/// A helper function for converting Scalar types to vector types. If
+/// the incoming type is void, we return void. If the EC represents a
+/// scalar, we return the scalar type.
+inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
+  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
+    return Scalar;
+  return VectorType::get(Scalar, EC);
+}
+
+inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
+  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
+}
+
+/// A helper for converting structs of scalar types to structs of vector types.
+/// Note:
+/// - If \p EC is scalar, \p StructTy is returned unchanged
+/// - Only unpacked literal struct types are supported
+Type *toVectorizedStructTy(StructType *StructTy, ElementCount EC);
+
+/// A helper for converting structs of vector types to structs of scalar types.
+/// Note: Only unpacked literal struct types are supported.
+Type *toScalarizedStructTy(StructType *StructTy);
+
+/// Returns true if `StructTy` is an unpacked literal struct where all elements
+/// are vectors of matching element count. This does not include empty structs.
+bool isVectorizedStructTy(StructType *StructTy);
+
+/// A helper for converting to vectorized types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type.
+/// Note:
+/// - If the incoming type is void, we return void
+/// - If \p EC is scalar, \p Ty is returned unchanged
+/// - Only unpacked literal struct types are supported
+inline Type *toVectorizedTy(Type *Ty, ElementCount EC) {
+  if (StructType *StructTy = dyn_cast<StructType>(Ty))
+    return toVectorizedStructTy(StructTy, EC);
+  return ToVectorTy(Ty, EC);
+}
+
+/// A helper for converting vectorized types to scalarized (non-vector) types.
+/// For vector types, this is equivalent to calling .getScalarType(). For
+/// struct types, this returns a new struct where each element type has been
+/// converted to a scalar type.
+/// Note: Only unpacked literal struct types are supported.
+inline Type *toScalarizedTy(Type *Ty) {
+  if (StructType *StructTy = dyn_cast<StructType>(Ty))
+    return toScalarizedStructTy(StructTy);
+  return Ty->getScalarType();
+}
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
+inline bool isVectorizedTy(Type *Ty) {
+  if (StructType *StructTy = dyn_cast<StructType>(Ty))
+    return isVectorizedStructTy(StructTy);
+  return Ty->isVectorTy();
+}
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
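+/// For example (sketch): for the literal struct type {f32, i64} this returns
+/// its elements {f32, i64}; for a non-struct type T it returns {T}.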
+inline ArrayRef<Type *> getContainedTypes(Type *const &Ty) {
+  if (auto *StructTy = dyn_cast<StructType>(Ty))
+    return StructTy->elements();
+  return ArrayRef(&Ty, 1);
+}
+
+/// Returns the number of vector elements for a vectorized type.
+inline ElementCount getVectorizedTypeVF(Type *Ty) {
+  assert(isVectorizedTy(Ty) && "expected vectorized type");
+  return cast<VectorType>(getContainedTypes(Ty).front())->getElementCount();
+}
+
+inline bool isUnpackedStructLiteral(StructType *StructTy) {
+  return StructTy->isLiteral() && !StructTy->isPacked();
+}
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 544f4ea9223d0e..5f6254b2313180 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -73,6 +73,7 @@ add_llvm_component_library(LLVMCore
   Value.cpp
   ValueSymbolTable.cpp
   VectorBuilder.cpp
+  VectorTypeUtils.cpp
   Verifier.cpp
   VFABIDemangler.cpp
   RuntimeLibcalls.cpp
diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index 897583084bf38c..62f96b10cea4ac 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/VectorTypeUtils.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <limits>
@@ -346,12 +347,20 @@ getScalableECFromSignature(const FunctionType *Signature, const VFISAKind ISA,
   // Also check the return type if not void.
   Type *RetTy = Signature->getReturnType();
   if (!RetTy->isVoidTy()) {
-    std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
-    // If we have an unknown scalar element type we can't find a reasonable VF.
-    if (!ReturnEC)
+    // If the return type is a struct, only allow unpacked struct literals.
+    StructType *StructTy = dyn_cast<StructType>(RetTy);
+    if (StructTy && !isUnpackedStructLiteral(StructTy))
       return std::nullopt;
-    if (ElementCount::isKnownLT(*ReturnEC, MinEC))
-      MinEC = *ReturnEC;
+
+    for (Type *RetTy : getContainedTypes(RetTy)) {
+      std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
+      // If we have an unknown scalar element type we can't find a reasonable
+      // VF.
+      if (!ReturnEC)
+        return std::nullopt;
+      if (ElementCount::isKnownLT(*ReturnEC, MinEC))
+        MinEC = *ReturnEC;
+    }
   }
   // The SVE Vector function call ABI bases the VF on the widest element types
@@ -566,7 +575,7 @@ FunctionType *VFABI::createFunctionType(const VFInfo &Info,
   auto *RetTy = ScalarFTy->getReturnType();
   if (!RetTy->isVoidTy())
-    RetTy = VectorType::get(RetTy, VF);
+    RetTy = toVectorizedTy(RetTy, VF);
   return FunctionType::get(RetTy, VecTypes, false);
 }
diff --git a/llvm/lib/IR/VectorTypeUtils.cpp b/llvm/lib/IR/VectorTypeUtils.cpp
new file mode 100644
index 00000000000000..e6e265414a2b8e
--- /dev/null
+++ b/llvm/lib/IR/VectorTypeUtils.cpp
@@ -0,0 +1,54 @@
+//===------- VectorTypeUtils.cpp - Vector type utility functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/VectorTypeUtils.h"
+#include "llvm/ADT/SmallVectorExtras.h"
+
+using namespace llvm;
+
+/// A helper for converting structs of scalar types to structs of vector types.
+/// Note: Only unpacked literal struct types are supported.
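+/// For example (sketch): toVectorizedStructTy({f64, f64}, EC=4) yields
+/// {<4 x double>, <4 x double>}, while a scalar EC returns the struct
+/// unchanged.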
+Type *llvm::toVectorizedStructTy(StructType *StructTy, ElementCount EC) {
+  if (EC.isScalar())
+    return StructTy;
+  assert(isUnpackedStructLiteral(StructTy) &&
+         "expected unpacked struct literal");
+  assert(all_of(StructTy->elements(), VectorType::isValidElementType) &&
+         "expected all element types to be valid vector element types");
+  return StructType::get(
+      StructTy->getContext(),
+      map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
+        return VectorType::get(ElTy, EC);
+      }));
+}
+
+/// A helper for converting structs of vector types to structs of scalar types.
+/// Note: Only unpacked literal struct types are supported.
+Type *llvm::toScalarizedStructTy(StructType *StructTy) {
+  assert(isUnpackedStructLiteral(StructTy) &&
+         "expected unpacked struct literal");
+  return StructType::get(
+      StructTy->getContext(),
+      map_to_vector(StructTy->elements(), [](Type *ElTy) -> Type * {
+        return ElTy->getScalarType();
+      }));
+}
+
+/// Returns true if `StructTy` is an unpacked literal struct where all elements
+/// are vectors of matching element count. This does not include empty structs.
+bool llvm::isVectorizedStructTy(StructType *StructTy) {
+  if (!isUnpackedStructLiteral(StructTy))
+    return false;
+  auto ElemTys = StructTy->elements();
+  if (ElemTys.empty() || !ElemTys.front()->isVectorTy())
+    return false;
+  ElementCount VF = cast<VectorType>(ElemTys.front())->getElementCount();
+  return all_of(ElemTys, [&](Type *Ty) {
+    return Ty->isVectorTy() && cast<VectorType>(Ty)->getElementCount() == VF;
+  });
+}
diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt
index ed93ee547d2231..b3dfe3d72fd385 100644
--- a/llvm/unittests/IR/CMakeLists.txt
+++ b/llvm/unittests/IR/CMakeLists.txt
@@ -51,6 +51,7 @@ add_llvm_unittest(IRTests
   ValueMapTest.cpp
   ValueTest.cpp
   VectorBuilderTest.cpp
+  VectorTypeUtilsTest.cpp
   VectorTypesTest.cpp
   VerifierTest.cpp
   VFABIDemanglerTest.cpp
diff --git a/llvm/unittests/IR/VFABIDemanglerTest.cpp b/llvm/unittests/IR/VFABIDemanglerTest.cpp
index 07bff16df49335..e30e0f865f7199 100644
--- a/llvm/unittests/IR/VFABIDemanglerTest.cpp
+++ b/llvm/unittests/IR/VFABIDemanglerTest.cpp
@@ -40,7 +40,9 @@ class VFABIParserTest : public ::testing::Test {
   VFInfo Info;
   /// Reset the data needed for the test.
void reset(const StringRef ScalarFTyStr) { - M = parseAssemblyString("declare void @dummy()", Err, Ctx); + M = parseAssemblyString("%dummy_named_struct = type { double, double }\n" + "declare void @dummy()", + Err, Ctx); EXPECT_NE(M.get(), nullptr) << "Loading an invalid module.\n " << Err.getMessage() << "\n"; Type *Ty = parseType(ScalarFTyStr, Err, *(M)); @@ -753,6 +755,87 @@ TEST_F(VFABIParserTest, ParseVoidReturnTypeSVE) { EXPECT_EQ(VectorName, "vector_foo"); } +TEST_F(VFABIParserTest, ParseWideStructReturnTypeSVE) { + EXPECT_TRUE( + invokeParser("_ZGVsMxv_foo(vector_foo)", "{double, double}(float)")); + EXPECT_EQ(ISA, VFISAKind::SVE); + EXPECT_TRUE(isMasked()); + ElementCount NXV2 = ElementCount::getScalable(2); + FunctionType *FTy = FunctionType::get( + StructType::get(VectorType::get(Type::getDoubleTy(Ctx), NXV2), + VectorType::get(Type::getDoubleTy(Ctx), NXV2)), + { + VectorType::get(Type::getFloatTy(Ctx), NXV2), + VectorType::get(Type::getInt1Ty(Ctx), NXV2), + }, + false); + EXPECT_EQ(getFunctionType(), FTy); + EXPECT_EQ(Parameters.size(), 2U); + EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector})); + EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::GlobalPredicate})); + EXPECT_EQ(VF, NXV2); + EXPECT_EQ(ScalarName, "foo"); + EXPECT_EQ(VectorName, "vector_foo"); +} + +TEST_F(VFABIParserTest, ParseWideStructMixedReturnTypeSVE) { + EXPECT_TRUE(invokeParser("_ZGVsMxv_foo(vector_foo)", "{float, i64}(float)")); + EXPECT_EQ(ISA, VFISAKind::SVE); + EXPECT_TRUE(isMasked()); + ElementCount NXV2 = ElementCount::getScalable(2); + FunctionType *FTy = FunctionType::get( + StructType::get(VectorType::get(Type::getFloatTy(Ctx), NXV2), + VectorType::get(Type::getInt64Ty(Ctx), NXV2)), + { + VectorType::get(Type::getFloatTy(Ctx), NXV2), + VectorType::get(Type::getInt1Ty(Ctx), NXV2), + }, + false); + EXPECT_EQ(getFunctionType(), FTy); + EXPECT_EQ(Parameters.size(), 2U); + EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector})); + EXPECT_EQ(Parameters[1], VFParameter({1, VFParamKind::GlobalPredicate})); + EXPECT_EQ(VF, NXV2); + EXPECT_EQ(ScalarName, "foo"); + EXPECT_EQ(VectorName, "vector_foo"); +} + +TEST_F(VFABIParserTest, ParseWideStructReturnTypeNEON) { + EXPECT_TRUE( + invokeParser("_ZGVnN4v_foo(vector_foo)", "{float, float}(float)")); + EXPECT_EQ(ISA, VFISAKind::AdvancedSIMD); + EXPECT_FALSE(isMasked()); + ElementCount V4 = ElementCount::getFixed(4); + FunctionType *FTy = FunctionType::get( + StructType::get(VectorType::get(Type::getFloatTy(Ctx), V4), + VectorType::get(Type::getFloatTy(Ctx), V4)), + { + VectorType::get(Type::getFloatTy(Ctx), V4), + }, + false); + EXPECT_EQ(getFunctionType(), FTy); + EXPECT_EQ(Parameters.size(), 1U); + EXPECT_EQ(Parameters[0], VFParameter({0, VFParamKind::Vector})); + EXPECT_EQ(VF, V4); + EXPECT_EQ(ScalarName, "foo"); + EXPECT_EQ(VectorName, "vector_foo"); +} + +TEST_F(VFABIParserTest, ParseUnsupportedStructReturnTypesSVE) { + // Struct with array element type. + EXPECT_FALSE( + invokeParser("_ZGVsMxv_foo(vector_foo)", "{double, [4 x float]}(float)")); + // Nested struct type. + EXPECT_FALSE( + invokeParser("_ZGVsMxv_foo(vector_foo)", "{{float, float}}(float)")); + // Packed struct type. + EXPECT_FALSE( + invokeParser("_ZGVsMxv_foo(vector_foo)", "<{double, float}>(float)")); + // Named struct type. + EXPECT_FALSE( + invokeParser("_ZGVsMxv_foo(vector_foo)", "%dummy_named_struct(float)")); +} + // Make sure we reject unsupported parameter types. 
TEST_F(VFABIParserTest, ParseUnsupportedElementTypeSVE) { EXPECT_FALSE(invokeParser("_ZGVsMxv_foo(vector_foo)", "void(i128)")); diff --git a/llvm/unittests/IR/VectorTypeUtilsTest.cpp b/llvm/unittests/IR/VectorTypeUtilsTest.cpp new file mode 100644 index 00000000000000..c77f183e921de4 --- /dev/null +++ b/llvm/unittests/IR/VectorTypeUtilsTest.cpp @@ -0,0 +1,149 @@ +//===------- VectorTypeUtilsTest.cpp - Vector utils tests -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/VectorTypeUtils.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/LLVMContext.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +class VectorTypeUtilsTest : public ::testing::Test {}; + +TEST(VectorTypeUtilsTest, TestToVectorizedTy) { + LLVMContext C; + + Type *ITy = Type::getInt32Ty(C); + Type *FTy = Type::getFloatTy(C); + Type *HomogeneousStructTy = StructType::get(FTy, FTy, FTy); + Type *MixedStructTy = StructType::get(FTy, ITy); + Type *VoidTy = Type::getVoidTy(C); + + for (ElementCount VF : + {ElementCount::getFixed(4), ElementCount::getScalable(2)}) { + Type *IntVec = toVectorizedTy(ITy, VF); + EXPECT_TRUE(isa(IntVec)); + EXPECT_EQ(IntVec, VectorType::get(ITy, VF)); + + Type *FloatVec = toVectorizedTy(FTy, VF); + EXPECT_TRUE(isa(FloatVec)); + EXPECT_EQ(FloatVec, VectorType::get(FTy, VF)); + + Type *WideHomogeneousStructTy = toVectorizedTy(HomogeneousStructTy, VF); + EXPECT_TRUE(isa(WideHomogeneousStructTy)); + EXPECT_TRUE( + cast(WideHomogeneousStructTy)->containsHomogeneousTypes()); + EXPECT_TRUE(cast(WideHomogeneousStructTy)->getNumElements() == + 3); + EXPECT_TRUE(cast(WideHomogeneousStructTy)->getElementType(0) == + VectorType::get(FTy, VF)); + + Type *WideMixedStructTy = toVectorizedTy(MixedStructTy, VF); + EXPECT_TRUE(isa(WideMixedStructTy)); + EXPECT_TRUE(cast(WideMixedStructTy)->getNumElements() == 2); + EXPECT_TRUE(cast(WideMixedStructTy)->getElementType(0) == + VectorType::get(FTy, VF)); + EXPECT_TRUE(cast(WideMixedStructTy)->getElementType(1) == + VectorType::get(ITy, VF)); + + EXPECT_EQ(toVectorizedTy(VoidTy, VF), VoidTy); + } + + ElementCount ScalarVF = ElementCount::getFixed(1); + for (Type *Ty : {ITy, FTy, HomogeneousStructTy, MixedStructTy, VoidTy}) { + EXPECT_EQ(toVectorizedTy(Ty, ScalarVF), Ty); + } +} + +TEST(VectorTypeUtilsTest, TestToScalarizedTy) { + LLVMContext C; + + Type *ITy = Type::getInt32Ty(C); + Type *FTy = Type::getFloatTy(C); + Type *HomogeneousStructTy = StructType::get(FTy, FTy, FTy); + Type *MixedStructTy = StructType::get(FTy, ITy); + Type *VoidTy = Type::getVoidTy(C); + + for (ElementCount VF : {ElementCount::getFixed(1), ElementCount::getFixed(4), + ElementCount::getScalable(2)}) { + for (Type *Ty : {ITy, FTy, HomogeneousStructTy, MixedStructTy, VoidTy}) { + // toScalarizedTy should be the inverse of toVectorizedTy. 
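+      // e.g. toScalarizedTy({<4 x float>, <4 x i32>}) == {float, i32}.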
+ EXPECT_EQ(toScalarizedTy(toVectorizedTy(Ty, VF)), Ty); + }; + } +} + +TEST(VectorTypeUtilsTest, TestGetContainedTypes) { + LLVMContext C; + + Type *ITy = Type::getInt32Ty(C); + Type *FTy = Type::getFloatTy(C); + Type *HomogeneousStructTy = StructType::get(FTy, FTy, FTy); + Type *MixedStructTy = StructType::get(FTy, ITy); + Type *VoidTy = Type::getVoidTy(C); + + EXPECT_EQ(getContainedTypes(ITy), ArrayRef({ITy})); + EXPECT_EQ(getContainedTypes(FTy), ArrayRef({FTy})); + EXPECT_EQ(getContainedTypes(VoidTy), ArrayRef({VoidTy})); + EXPECT_EQ(getContainedTypes(HomogeneousStructTy), + ArrayRef({FTy, FTy, FTy})); + EXPECT_EQ(getContainedTypes(MixedStructTy), ArrayRef({FTy, ITy})); +} + +TEST(VectorTypeUtilsTest, TestIsVectorizedTy) { + LLVMContext C; + + Type *ITy = Type::getInt32Ty(C); + Type *FTy = Type::getFloatTy(C); + Type *NarrowStruct = StructType::get(FTy, ITy); + Type *VoidTy = Type::getVoidTy(C); + + EXPECT_FALSE(isVectorizedTy(ITy)); + EXPECT_FALSE(isVectorizedTy(NarrowStruct)); + EXPECT_FALSE(isVectorizedTy(VoidTy)); + + ElementCount VF = ElementCount::getFixed(4); + EXPECT_TRUE(isVectorizedTy(toVectorizedTy(ITy, VF))); + EXPECT_TRUE(isVectorizedTy(toVectorizedTy(NarrowStruct, VF))); + + Type *MixedVFStruct = + StructType::get(VectorType::get(ITy, ElementCount::getFixed(2)), + VectorType::get(ITy, ElementCount::getFixed(4))); + EXPECT_FALSE(isVectorizedTy(MixedVFStruct)); + + // Currently only literals types are considered wide. + Type *NamedWideStruct = StructType::create("Named", VectorType::get(ITy, VF), + VectorType::get(ITy, VF)); + EXPECT_FALSE(isVectorizedTy(NamedWideStruct)); + + // Currently only unpacked types are considered wide. + Type *PackedWideStruct = StructType::get( + C, ArrayRef{VectorType::get(ITy, VF), VectorType::get(ITy, VF)}, + /*isPacked=*/true); + EXPECT_FALSE(isVectorizedTy(PackedWideStruct)); +} + +TEST(VectorTypeUtilsTest, TestGetVectorizedTypeVF) { + LLVMContext C; + + Type *ITy = Type::getInt32Ty(C); + Type *FTy = Type::getFloatTy(C); + Type *HomogeneousStructTy = StructType::get(FTy, FTy, FTy); + Type *MixedStructTy = StructType::get(FTy, ITy); + + for (ElementCount VF : + {ElementCount::getFixed(4), ElementCount::getScalable(2)}) { + for (Type *Ty : {ITy, FTy, HomogeneousStructTy, MixedStructTy}) { + EXPECT_EQ(getVectorizedTypeVF(toVectorizedTy(Ty, VF)), VF); + }; + } +} + +} // namespace From 0b4ee8d4ee4be78e90fd7c4dc4a8f05e6b1a091e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 18 Dec 2024 09:48:38 +0000 Subject: [PATCH 04/37] [X86] combineKSHIFT - fold kshiftr(kshiftr/extract_subvector(X,C1),C2) --> kshiftr(X,C1+C2) (#115528) Merge serial KSHIFTR nodes, possibly separated by EXTRACT_SUBVECTOR, to allow mask instructions to be computed in parallel. 
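For illustration, the avx512-bugfix-26264.ll update below turns a dependent
pair of shifts into a single shift from the original mask:

; before: the second shift depends on the first
kshiftrd $16, %k1, %k1
kshiftrw $8,  %k1, %k1
; after: one shift straight from the original mask register
kshiftrd $24, %k1, %k1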
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +- llvm/test/CodeGen/X86/avx512-bugfix-26264.ll | 16 +- .../CodeGen/X86/avx512-masked-memop-64-32.ll | 8 +- llvm/test/CodeGen/X86/pr33349.ll | 24 +- llvm/test/CodeGen/X86/pr34177.ll | 12 +- llvm/test/CodeGen/X86/vec_smulo.ll | 8 +- llvm/test/CodeGen/X86/vec_umulo.ll | 8 +- llvm/test/CodeGen/X86/vector-compress.ll | 8 +- .../CodeGen/X86/vector-replicaton-i1-mask.ll | 290 +++++++++--------- 9 files changed, 208 insertions(+), 189 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2571873dba8483..2479bc3fd8f080 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58706,11 +58706,30 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); - + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return DAG.getConstant(0, SDLoc(N), VT); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // Fold kshiftr(extract_subvector(X,C1),C2) + // --> extract_subvector(kshiftr(X,C1+C2),0) + // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2) + if (N->getOpcode() == X86ISD::KSHIFTR) { + SDLoc DL(N); + if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR || + N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) { + SDValue Src = N->getOperand(0).getOperand(0); + uint64_t Amt = N->getConstantOperandVal(1) + + N->getOperand(0).getConstantOperandVal(1); + EVT SrcVT = Src.getValueType(); + if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) { + SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src, + DAG.getTargetConstant(Amt, DL, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift, + DAG.getIntPtrConstant(0, DL)); + } + } + } + APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll index 537f42dd9c2c59..e0f3b6c4ec90a4 100644 --- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll @@ -7,11 +7,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double> ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 ; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: kshiftrd $8, %k1, %k2 ; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2} +; AVX512BW-NEXT: kshiftrd $24, %k1, %k1 ; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: retq %res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) @@ -24,11 +24,11 @@ define <32 x i64> @test_load_32i64(ptr %ptrs, <32 x i1> %mask, <32 x i64> %src0) ; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 ; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 +; AVX512BW-NEXT: kshiftrd $8, %k1, %k2 ; AVX512BW-NEXT: vpblendmq 64(%rdi), 
%zmm2, %zmm1 {%k2} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k2} +; AVX512BW-NEXT: kshiftrd $24, %k1, %k1 ; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: retq %res = call <32 x i64> @llvm.masked.load.v32i64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll index bd52b9cd41584c..f6e5986afac531 100644 --- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -261,11 +261,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double> ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 ; SKX-NEXT: vpmovb2m %ymm0, %k1 ; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} -; SKX-NEXT: kshiftrw $8, %k1, %k2 +; SKX-NEXT: kshiftrd $8, %k1, %k2 ; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2} -; SKX-NEXT: kshiftrd $16, %k1, %k1 -; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1} -; SKX-NEXT: kshiftrw $8, %k1, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k2 +; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2} +; SKX-NEXT: kshiftrd $24, %k1, %k1 ; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1} ; SKX-NEXT: retq %res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll index 83d3a33572266f..c879cb9867ab29 100644 --- a/llvm/test/CodeGen/X86/pr33349.ll +++ b/llvm/test/CodeGen/X86/pr33349.ll @@ -17,23 +17,23 @@ target triple = "x86_64-unknown-linux-gnu" ; KNL-NEXT: fldz ; KNL-NEXT: fld %st(0) ; KNL-NEXT: fcmovne %st(2), %st -; KNL-NEXT: testb $2, %al -; KNL-NEXT: fld %st(1) -; KNL-NEXT: fcmovne %st(3), %st ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $1, %al +; KNL-NEXT: fld %st(1) +; KNL-NEXT: fcmovne %st(3), %st +; KNL-NEXT: testb $2, %al ; KNL-NEXT: fld %st(2) ; KNL-NEXT: fcmovne %st(4), %st -; KNL-NEXT: testb $2, %al +; KNL-NEXT: testb $8, %al ; KNL-NEXT: fxch %st(3) ; KNL-NEXT: fcmovne %st(4), %st ; KNL-NEXT: fstp %st(4) ; KNL-NEXT: fxch %st(3) +; KNL-NEXT: fstpt 30(%rdi) +; KNL-NEXT: fxch %st(1) ; KNL-NEXT: fstpt 10(%rdi) ; KNL-NEXT: fxch %st(1) ; KNL-NEXT: fstpt (%rdi) -; KNL-NEXT: fxch %st(1) -; KNL-NEXT: fstpt 30(%rdi) ; KNL-NEXT: fstpt 20(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -49,23 +49,23 @@ target triple = "x86_64-unknown-linux-gnu" ; SKX-NEXT: fldz ; SKX-NEXT: fld %st(0) ; SKX-NEXT: fcmovne %st(2), %st -; SKX-NEXT: testb $2, %al -; SKX-NEXT: fld %st(1) -; SKX-NEXT: fcmovne %st(3), %st ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testb $1, %al +; SKX-NEXT: fld %st(1) +; SKX-NEXT: fcmovne %st(3), %st +; SKX-NEXT: testb $2, %al ; SKX-NEXT: fld %st(2) ; SKX-NEXT: fcmovne %st(4), %st -; SKX-NEXT: testb $2, %al +; SKX-NEXT: testb $8, %al ; SKX-NEXT: fxch %st(3) ; SKX-NEXT: fcmovne %st(4), %st ; SKX-NEXT: fstp %st(4) ; SKX-NEXT: fxch %st(3) +; SKX-NEXT: fstpt 30(%rdi) +; SKX-NEXT: fxch %st(1) ; SKX-NEXT: fstpt 10(%rdi) ; SKX-NEXT: fxch %st(1) ; SKX-NEXT: fstpt (%rdi) -; SKX-NEXT: fxch %st(1) -; SKX-NEXT: fstpt 30(%rdi) ; SKX-NEXT: fstpt 20(%rdi) ; SKX-NEXT: retq bb: diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll index 29922c2ac1a716..5b2431eb214955 100644 --- a/llvm/test/CodeGen/X86/pr34177.ll +++ b/llvm/test/CodeGen/X86/pr34177.ll @@ 
-51,18 +51,18 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr { ; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 ; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb $2, %al +; AVX512VL-NEXT: testb $8, %al ; AVX512VL-NEXT: fld1 ; AVX512VL-NEXT: fldz ; AVX512VL-NEXT: fld %st(0) ; AVX512VL-NEXT: fcmovne %st(2), %st -; AVX512VL-NEXT: testb $1, %al +; AVX512VL-NEXT: testb $2, %al ; AVX512VL-NEXT: fld %st(1) ; AVX512VL-NEXT: fcmovne %st(3), %st -; AVX512VL-NEXT: kmovd %k1, %eax -; AVX512VL-NEXT: testb $2, %al +; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fld %st(2) ; AVX512VL-NEXT: fcmovne %st(4), %st +; AVX512VL-NEXT: kmovd %k1, %eax ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fxch %st(3) ; AVX512VL-NEXT: fcmovne %st(4), %st @@ -77,12 +77,12 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr { ; AVX512VL-NEXT: fstpt 10(%rdi) ; AVX512VL-NEXT: fxch %st(1) ; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 60(%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) ; AVX512VL-NEXT: fstpt 20(%rdi) ; AVX512VL-NEXT: fadd %st, %st(0) ; AVX512VL-NEXT: fstpt (%rdi) ; AVX512VL-NEXT: fadd %st, %st(0) -; AVX512VL-NEXT: fstpt 60(%rdi) -; AVX512VL-NEXT: fadd %st, %st(0) ; AVX512VL-NEXT: fstpt 40(%rdi) %1 = icmp eq <4 x i64> , %a %2 = select <4 x i1> %1, <4 x x86_fp80> , <4 x x86_fp80> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 22b5246443fa8a..7e081310c35be5 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -2668,11 +2668,11 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 +; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi) ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 4d7d2573183e07..68c6ca93576b76 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2329,11 +2329,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1 -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 +; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi) ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 
f8c076db65de94..17b98b5ebcaeae 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -840,12 +840,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240 ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VL-NEXT: kshiftrq $48, %k1, %k3 ; AVX512VL-NEXT: kshiftrq $32, %k1, %k4 -; AVX512VL-NEXT: kshiftrd $16, %k4, %k3 -; AVX512VL-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VL-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp) -; AVX512VL-NEXT: kshiftrw $8, %k1, %k0 +; AVX512VL-NEXT: kshiftrq $8, %k1, %k0 ; AVX512VL-NEXT: kxorw %k0, %k1, %k0 ; AVX512VL-NEXT: kshiftrw $4, %k0, %k5 ; AVX512VL-NEXT: kxorw %k5, %k0, %k0 @@ -859,7 +859,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4) ; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z} ; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512VL-NEXT: kshiftrw $8, %k4, %k0 +; AVX512VL-NEXT: kshiftrq $40, %k1, %k0 ; AVX512VL-NEXT: kxorw %k0, %k4, %k0 ; AVX512VL-NEXT: kshiftrw $4, %k0, %k4 ; AVX512VL-NEXT: kxorw %k4, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 358b2a503df261..a8df418143f325 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -256,12 +256,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) @@ -277,12 +277,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), 
%zmm3 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) @@ -409,19 +409,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) @@ -444,19 +444,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) @@ -2605,12 +2605,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: 
kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) @@ -2626,12 +2626,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) @@ -2753,19 +2753,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) @@ -2788,19 +2788,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 
{%k3} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) @@ -3000,33 +3000,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k4, %k5 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k3, %k4 ; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k3, %k4 ; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) @@ -3063,33 +3063,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; 
AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k4, %k5 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k3, %k4 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k3, %k4 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) @@ -3309,14 +3309,14 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; @@ -3330,14 +3330,14 @@ 
define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -9338,12 +9338,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) @@ -9362,12 +9362,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) @@ -12938,12 +12938,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 
{%k2} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) @@ -12959,12 +12959,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) @@ -13088,19 +13088,19 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-NEXT: kshiftrq $48, %k2, %k3 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) @@ -13299,33 +13299,33 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 -; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} -; AVX512BW-NEXT: kshiftrq $32, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-NEXT: kshiftrq $48, %k4, %k5 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} +; AVX512BW-NEXT: kshiftrq $32, %k4, %k4 ; AVX512BW-NEXT: vmovdqa32 
128(%rsi), %zmm3 {%k4} {z} -; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-NEXT: kshiftrq $16, %k3, %k4 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512BW-NEXT: kshiftrq $32, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-NEXT: kshiftrq $48, %k3, %k4 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} +; AVX512BW-NEXT: kshiftrq $32, %k3, %k3 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-NEXT: kshiftrq $48, %k2, %k3 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx) @@ -13682,8 +13682,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5 @@ -13691,73 +13691,73 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $48, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512BW-NEXT: vpmovb2m %zmm5, %k2 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: 
vmovdqa32 448(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} ; AVX512BW-NEXT: vpmovb2m %zmm10, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $48, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm15, %k2 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k2} {z} +; AVX512BW-NEXT: vpmovb2m %zmm16, %k2 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm16 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm16, %k1 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm15, %k1 +; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm15 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $48, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} ; AVX512BW-NEXT: vpmovb2m %zmm12, %k2 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} ; AVX512BW-NEXT: vpmovb2m %zmm7, %k1 ; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $48, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm29 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm30 {%k1} {z} +; 
AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm31, 1984(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 1920(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 1920(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 1856(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 1792(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 1792(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx) @@ -13769,11 +13769,11 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1024(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) From 31239540b09bf5315b3a795160cf47d4c4edcd4e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 18 Dec 2024 09:52:57 +0000 Subject: [PATCH 05/37] [gn build] Port 1ee740a79620 --- llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn index 7a05438441b64c..4ad22724d6225c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn @@ -89,6 +89,7 @@ static_library("IR") { "Value.cpp", "ValueSymbolTable.cpp", "VectorBuilder.cpp", + "VectorTypeUtils.cpp", "Verifier.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn index ccee5d79afdccd..e4ca566782e523 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn @@ -55,6 +55,7 @@ unittest("IRTests") { "ValueMapTest.cpp", "ValueTest.cpp", "VectorBuilderTest.cpp", + "VectorTypeUtilsTest.cpp", "VectorTypesTest.cpp", "VerifierTest.cpp", ] From 7e49ada9a3c0f8228c79de7f65d3255916087bb0 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Wed, 18 Dec 2024 14:03:20 +0400 Subject: [PATCH 06/37] [github/CODEOWNERS] Add yota9 as BOLT reviewer --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 098d36f1622052..ab8b75f415870d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -129,7 +129,7 @@ /mlir/**/Transforms/SROA.* @moxinilian # BOLT -/bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci +/bolt/ @aaupov @maksfb @rafaelauler @ayermolo @dcci @yota9 # Bazel build system. /utils/bazel/ @rupprecht @keith From f8d270474c14c6705c77971494505dbe4b6d55ae Mon Sep 17 00:00:00 2001 From: Vladi Krapp Date: Wed, 18 Dec 2024 10:10:51 +0000 Subject: [PATCH 07/37] [ARM] Reduce loop unroll when low overhead branching is available (#120065) For processors with low overhead branching (LOB), runtime unrolling the innermost loop is often detrimental to performance. 
In these cases the loop remainder gets unrolled into a series of compare-and-jump blocks, which in deeply nested loops get executed multiple times, negating the benefits of LOB. This is particularly noticeable when the loop trip count of the innermost loop varies within the outer loop, such as in the case of triangular matrix decompositions. In these cases we will prefer to not unroll the innermost loop, with the intention for it to be executed as a low overhead loop.
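For illustration, a sketch of the kind of loop nest this heuristic targets. The code below is hypothetical (the function name, types, and bounds are illustrative, not taken from any benchmark in this patch): the inner trip count depends on the outer induction variable, so a runtime-unrolled remainder would be re-entered on nearly every outer iteration.

    // Hypothetical triangular loop nest (LU-style elimination): the inner
    // loop runs n - j - 1 iterations, shrinking as j advances, so an
    // unrolled remainder executes on nearly every outer iteration. Left
    // intact, the inner loop can instead be lowered as a low overhead loop.
    void scale_upper(int n, float a[n][n]) {
      for (int j = 0; j < n; j++) {
        float pivot = a[j][j];
        for (int k = j + 1; k < n; k++)
          a[j][k] /= pivot;
      }
    }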
--- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 23 +++++++++++++++- .../Transforms/LoopUnroll/ARM/lob-unroll.ll | 27 ++++++++++++------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 0e29648a7a284f..639f3bf8fc62e3 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2592,11 +2592,32 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, return; } + // For processors with low overhead branching (LOB), runtime unrolling the + // innermost loop is often detrimental to performance. In these cases the loop + // remainder gets unrolled into a series of compare-and-jump blocks, which in + // deeply nested loops get executed multiple times, negating the benefits of + // LOB. This is particularly noticeable when the loop trip count of the + // innermost loop varies within the outer loop, such as in the case of + // triangular matrix decompositions. In these cases we will prefer to not + // unroll the innermost loop, with the intention for it to be executed as a + // low overhead loop. + bool Runtime = true; + if (ST->hasLOB()) { + if (SE.hasLoopInvariantBackedgeTakenCount(L)) { + const auto *BETC = SE.getBackedgeTakenCount(L); + auto *Outer = L->getOutermostLoop(); + if ((L != Outer && Outer != L->getParentLoop()) || + (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) { + Runtime = false; + } + } + } + LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n"); UP.Partial = true; - UP.Runtime = true; + UP.Runtime = Runtime; UP.UnrollRemainder = true; UP.DefaultUnrollRuntimeCount = UnrollCount; UP.UnrollAndJam = true; diff --git a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll index b155f5d31045f9..111bc96b28806a 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/lob-unroll.ll @@ -1,17 +1,23 @@ +; RUN: opt -mcpu=cortex-m7 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=NLOB ; RUN: opt -mcpu=cortex-m55 -mtriple=thumbv8.1m.main -passes=loop-unroll -S %s -o - | FileCheck %s --check-prefix=LOB ; This test checks behaviour of loop unrolling on processors with low overhead branching available -; LOB-CHECK-LABEL: for.body{{.*}}.prol -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}}.prol.1 -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}}.prol.2 -; LOB-COUNT-1: fmul fast float -; LOB-CHECK-LABEL: for.body{{.*}} -; LOB-COUNT-4: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol.1: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}.prol.2: +; NLOB-COUNT-1: fmul fast float +; NLOB-LABEL: for.body{{.*}}: +; NLOB-COUNT-4: fmul fast float +; NLOB-NOT: fmul fast float + ; LOB-LABEL: for.body{{.*}}: ; LOB: fmul fast float ; LOB-NOT: fmul fast float + ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) define dso_local void @test(i32 noundef %n, ptr nocapture noundef %pA) local_unnamed_addr #0 { entry: @@ -20,7 +26,7 @@ entry: for.cond.loopexit: ; preds = %for.cond6.for.cond.cleanup8_crit_edge.us, %for.body %exitcond49.not = icmp eq i32 %add, %n - br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body + br i1 %exitcond49.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 for.cond.cleanup: ; preds = %for.cond.loopexit, %entry ret void @@ -61,3 +67,6 @@ for.cond6.for.cond.cleanup8_crit_edge.us: ; preds = %for.body9.us br i1 %exitcond48.not, label %for.cond.loopexit, label %for.cond6.preheader.us } +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.unroll.disable"} From b3eede5e1fa7ab742b86e9be22db7bccd2505b8a Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 18 Dec 2024 10:34:26 +0000 Subject: [PATCH 08/37] Add support for single reductions in ComplexDeinterleavingPass (#112875) The Complex Deinterleaving pass assumes that all values emitted will result in complex numbers; this patch removes that assumption and adds support for emitting just the real or imaginary components, not both.
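For illustration, the scalar shape of the pattern this enables. The snippet below is a hypothetical sketch (the function and parameter names are illustrative, not code from this patch): the loop accumulates only the real component of a complex dot product, so it reduces to a single value where the pass previously required both components of the result to be produced. On SVE2 this is the shape that can lower to the cdot instruction emitted by this change.

    // Hypothetical scalar model of a 'single' complex reduction: only the
    // real part of sum(a[i] * b[i]) is accumulated (re*re - im*im), so the
    // loop yields one reduction value instead of a real/imaginary pair.
    int dot_real(int n, const signed char *a_re, const signed char *a_im,
                 const signed char *b_re, const signed char *b_im) {
      int acc = 0;
      for (int i = 0; i < n; i++)
        acc += a_re[i] * b_re[i] - a_im[i] * b_im[i];
      return acc;
    }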
--- .../llvm/CodeGen/ComplexDeinterleavingPass.h | 2 + .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 278 +++- .../Target/AArch64/AArch64ISelLowering.cpp | 43 +- .../AArch64/complex-deinterleaving-cdot.ll | 1136 +++++++++++++++++ 4 files changed, 1434 insertions(+), 25 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h index 84a2673fecb5bf..4383249658e606 100644 --- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -35,6 +35,7 @@ struct ComplexDeinterleavingPass enum class ComplexDeinterleavingOperation { CAdd, CMulPartial, + CDot, // The following 'operations' are used to represent internal states. Backends // are not expected to try and support these in any capacity. Deinterleave, @@ -43,6 +44,7 @@ enum class ComplexDeinterleavingOperation { ReductionPHI, ReductionOperation, ReductionSelect, + ReductionSingle }; enum class ComplexDeinterleavingRotation { diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index f3f7ea9407b46f..3111354addacd1 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -108,6 +108,13 @@ static bool isNeg(Value *V); static Value *getNegOperand(Value *V); namespace { +template <typename T, typename IterT> +std::optional<T> findCommonBetweenCollections(IterT A, IterT B) { + auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); }); + if (Common != A.end()) + return std::make_optional(*Common); + return std::nullopt; +} class ComplexDeinterleavingLegacyPass : public FunctionPass { public: @@ -144,6 +151,7 @@ struct ComplexDeinterleavingCompositeNode { friend class ComplexDeinterleavingGraph; using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>; using RawNodePtr = ComplexDeinterleavingCompositeNode *; + bool OperandsValid = true; public: ComplexDeinterleavingOperation Operation; @@ -160,7 +168,11 @@ struct ComplexDeinterleavingCompositeNode { SmallVector<RawNodePtr> Operands; Value *ReplacementNode = nullptr; - void addOperand(NodePtr Node) { Operands.push_back(Node.get()); } + void addOperand(NodePtr Node) { + if (!Node || !Node.get()) + OperandsValid = false; + Operands.push_back(Node.get()); + } void dump() { dump(dbgs()); } void dump(raw_ostream &OS) { @@ -194,6 +206,8 @@ struct ComplexDeinterleavingCompositeNode { PrintNodeRef(Op); } } + + bool areOperandsValid() { return OperandsValid; } }; class ComplexDeinterleavingGraph { @@ -293,7 +307,7 @@ class ComplexDeinterleavingGraph { NodePtr submitCompositeNode(NodePtr Node) { CompositeNodes.push_back(Node); - if (Node->Real && Node->Imag) + if (Node->Real) CachedResult[{Node->Real, Node->Imag}] = Node; return Node; } @@ -327,6 +341,8 @@ class ComplexDeinterleavingGraph { /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); + NodePtr identifyPartialReduction(Value *R, Value *I); + NodePtr identifyDotProduct(Value *Inst); NodePtr identifyNode(Value *R, Value *I); @@ -396,6 +412,7 @@ class ComplexDeinterleavingGraph { /// * Deinterleave the final value outside of the loop and repurpose original /// reduction users void processReductionOperation(Value *OperationReplacement, RawNodePtr Node); + void processReductionSingle(Value *OperationReplacement, RawNodePtr Node); public: void dump() { dump(dbgs()); } @@ -891,17 +908,163 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, } ComplexDeinterleavingGraph::NodePtr -ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { - LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n"); - assert(R->getType() == I->getType() && - "Real and imaginary parts should not have different types"); +ComplexDeinterleavingGraph::identifyDotProduct(Value *V) { + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CDot, V->getType())) { + LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving " + "operation CDot with the type " + << *V->getType() << "\n"); + return nullptr; + } + + auto *Inst = cast<Instruction>(V); + auto *RealUser = cast<Instruction>(*Inst->user_begin()); + + NodePtr CN = + prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr); + + NodePtr ANode; + + const Intrinsic::ID PartialReduceInt 
= + Intrinsic::experimental_vector_partial_reduce_add; + + Value *AReal = nullptr; + Value *AImag = nullptr; + Value *BReal = nullptr; + Value *BImag = nullptr; + Value *Phi = nullptr; + + auto UnwrapCast = [](Value *V) -> Value * { + if (auto *CI = dyn_cast<CastInst>(V)) + return CI->getOperand(0); + return V; + }; + + auto PatternRot0 = m_Intrinsic<PartialReduceInt>( + m_Intrinsic<PartialReduceInt>(m_Value(Phi), + m_Mul(m_Value(BReal), m_Value(AReal))), + m_Neg(m_Mul(m_Value(BImag), m_Value(AImag)))); + + auto PatternRot270 = m_Intrinsic<PartialReduceInt>( + m_Intrinsic<PartialReduceInt>( + m_Value(Phi), m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))), + m_Mul(m_Value(BImag), m_Value(AReal))); + + if (match(Inst, PatternRot0)) { + CN->Rotation = ComplexDeinterleavingRotation::Rotation_0; + } else if (match(Inst, PatternRot270)) { + CN->Rotation = ComplexDeinterleavingRotation::Rotation_270; + } else { + Value *A0, *A1; + // The rotations 90 and 180 share the same operation pattern, so inspect the + // order of the operands, identifying where the real and imaginary + // components of A go, to discern between the aforementioned rotations. + auto PatternRot90Rot180 = m_Intrinsic<PartialReduceInt>( + m_Intrinsic<PartialReduceInt>(m_Value(Phi), + m_Mul(m_Value(BReal), m_Value(A0))), + m_Mul(m_Value(BImag), m_Value(A1))); + + if (!match(Inst, PatternRot90Rot180)) + return nullptr; + + A0 = UnwrapCast(A0); + A1 = UnwrapCast(A1); + + // Test if A0 is real/A1 is imag + ANode = identifyNode(A0, A1); + if (!ANode) { + // Test if A0 is imag/A1 is real + ANode = identifyNode(A1, A0); + // Unable to identify operand components, thus unable to identify rotation + if (!ANode) + return nullptr; + CN->Rotation = ComplexDeinterleavingRotation::Rotation_90; + AReal = A1; + AImag = A0; + } else { + AReal = A0; + AImag = A1; + CN->Rotation = ComplexDeinterleavingRotation::Rotation_180; + } + } + + AReal = UnwrapCast(AReal); + AImag = UnwrapCast(AImag); + BReal = UnwrapCast(BReal); + BImag = UnwrapCast(BImag); + + VectorType *VTy = cast<VectorType>(V->getType()); + Type *ExpectedOperandTy = VectorType::getSubdividedVectorType(VTy, 2); + if (AReal->getType() != ExpectedOperandTy) + return nullptr; + if (AImag->getType() != ExpectedOperandTy) + return nullptr; + if (BReal->getType() != ExpectedOperandTy) + return nullptr; + if (BImag->getType() != ExpectedOperandTy) + return nullptr; + + if (Phi->getType() != VTy && RealUser->getType() != VTy) + return nullptr; + + NodePtr Node = identifyNode(AReal, AImag); + + // In the case that a node was identified to figure out the rotation, ensure + // that trying to identify a node with AReal and AImag post-unwrap results in + // the same node + if (ANode && Node != ANode) { + LLVM_DEBUG( + dbgs() + << "Identified node is different from previously identified node. 
" + "Unable to confidently generate a complex operation node\n"); + return nullptr; + } + + CN->addOperand(Node); + CN->addOperand(identifyNode(BReal, BImag)); + CN->addOperand(identifyNode(Phi, RealUser)); + + return submitCompositeNode(CN); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { + // Partial reductions don't support non-vector types, so check these first + if (!isa(R->getType()) || !isa(I->getType())) + return nullptr; + + auto CommonUser = + findCommonBetweenCollections(R->users(), I->users()); + if (!CommonUser) + return nullptr; + + auto *IInst = dyn_cast(*CommonUser); + if (!IInst || IInst->getIntrinsicID() != + Intrinsic::experimental_vector_partial_reduce_add) + return nullptr; + + if (NodePtr CN = identifyDotProduct(IInst)) + return CN; + return nullptr; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { auto It = CachedResult.find({R, I}); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); return It->second; } + if (NodePtr CN = identifyPartialReduction(R, I)) + return CN; + + bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); + if (!IsReduction && R->getType() != I->getType()) + return nullptr; + if (NodePtr CN = identifySplat(R, I)) return CN; @@ -1427,12 +1590,20 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { if (It != RootToNode.end()) { auto RootNode = It->second; assert(RootNode->Operation == - ComplexDeinterleavingOperation::ReductionOperation); + ComplexDeinterleavingOperation::ReductionOperation || + RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. auto *R = cast(RootNode->Real); - auto *I = cast(RootNode->Imag); - auto *ReplacementAnchor = R->comesBefore(I) ? I : R; + auto *I = RootNode->Imag ? cast(RootNode->Imag) : nullptr; + + Instruction *ReplacementAnchor; + if (I) + ReplacementAnchor = R->comesBefore(I) ? I : R; + else + ReplacementAnchor = R; + if (ReplacementAnchor != RootI) return false; OrderedRoots.push_back(RootI); @@ -1523,7 +1694,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - auto *Real = OperationInstruction[i]; auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) @@ -1556,6 +1726,28 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { break; } } + + auto *Real = OperationInstruction[i]; + // We want to check that we have 2 operands, but the function attributes + // being counted as operands bloats this value. 
+ if (Real->getNumOperands() < 2) + continue; + + RealPHI = ReductionInfo[Real].first; + ImagPHI = nullptr; + PHIsFound = false; + auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1)); + if (Node && PHIsFound) { + LLVM_DEBUG( + dbgs() << "Identified single reduction starting from instruction: " + << *Real << "/" << *ReductionInfo[Real].second << "\n"); + Processed[i] = true; + auto RootNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr); + RootNode->addOperand(Node); + RootToNode[Real] = RootNode; + submitCompositeNode(RootNode); + } } RealPHI = nullptr; @@ -1563,6 +1755,12 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { } bool ComplexDeinterleavingGraph::checkNodes() { + + for (NodePtr N : CompositeNodes) { + if (!N->areOperandsValid()) + return false; + } + // Collect all instructions from roots to leaves SmallPtrSet<Instruction *, 16> AllInstructions; SmallVector<Instruction *, 8> Worklist; @@ -1831,7 +2029,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real, Instruction *Imag) { - if (Real != RealPHI || Imag != ImagPHI) + if (Real != RealPHI || (ImagPHI && Imag != ImagPHI)) return nullptr; PHIsFound = true; @@ -1926,6 +2124,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, Value *ReplacementNode; switch (Node->Operation) { + case ComplexDeinterleavingOperation::CDot: { + Value *Input0 = ReplaceOperandIfExist(Node, 0); + Value *Input1 = ReplaceOperandIfExist(Node, 1); + Value *Accumulator = ReplaceOperandIfExist(Node, 2); + assert(!Input1 || (Input0->getType() == Input1->getType() && + "Node inputs need to be of the same type")); + ReplacementNode = TL->createComplexDeinterleavingIR( + Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); + break; + } case ComplexDeinterleavingOperation::CAdd: case ComplexDeinterleavingOperation::CMulPartial: case ComplexDeinterleavingOperation::Symmetric: { @@ -1969,13 +2177,18 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, case ComplexDeinterleavingOperation::ReductionPHI: { // If Operation is ReductionPHI, a new empty PHINode is created. // It is filled later when the ReductionOperation is processed. 
+ auto *OldPHI = cast<PHINode>(Node->Real); auto *VTy = cast<VectorType>(Node->Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt()); - OldToNewPHI[dyn_cast<PHINode>(Node->Real)] = NewPHI; + OldToNewPHI[OldPHI] = NewPHI; ReplacementNode = NewPHI; break; } + case ComplexDeinterleavingOperation::ReductionSingle: + ReplacementNode = replaceNode(Builder, Node->Operands[0]); + processReductionSingle(ReplacementNode, Node); + break; case ComplexDeinterleavingOperation::ReductionOperation: ReplacementNode = replaceNode(Builder, Node->Operands[0]); processReductionOperation(ReplacementNode, Node); @@ -2000,6 +2213,38 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, return ReplacementNode; } +void ComplexDeinterleavingGraph::processReductionSingle( + Value *OperationReplacement, RawNodePtr Node) { + auto *Real = cast<Instruction>(Node->Real); + auto *OldPHI = ReductionInfo[Real].first; + auto *NewPHI = OldToNewPHI[OldPHI]; + auto *VTy = cast<VectorType>(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + + Value *Init = OldPHI->getIncomingValueForBlock(Incoming); + + IRBuilder<> Builder(Incoming->getTerminator()); + + Value *NewInit = nullptr; + if (auto *C = dyn_cast<Constant>(Init)) { + if (C->isZeroValue()) + NewInit = Constant::getNullValue(NewVTy); + } + + if (!NewInit) + NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy, + {Init, Constant::getNullValue(VTy)}); + + NewPHI->addIncoming(NewInit, Incoming); + NewPHI->addIncoming(OperationReplacement, BackEdge); + + auto *FinalReduction = ReductionInfo[Real].second; + Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt()); + + auto *AddReduce = Builder.CreateAddReduce(OperationReplacement); + FinalReduction->replaceAllUsesWith(AddReduce); +} + void ComplexDeinterleavingGraph::processReductionOperation( Value *OperationReplacement, RawNodePtr Node) { auto *Real = cast<Instruction>(Node->Real); @@ -2059,8 +2304,13 @@ void ComplexDeinterleavingGraph::replaceNodes() { auto *RootImag = cast<Instruction>(RootNode->Imag); ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); - DeadInstrRoots.push_back(cast<Instruction>(RootReal)); - DeadInstrRoots.push_back(cast<Instruction>(RootImag)); + DeadInstrRoots.push_back(RootReal); + DeadInstrRoots.push_back(RootImag); + } else if (RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle) { + auto *RootInst = cast<Instruction>(RootNode->Real); + ReductionInfo[RootInst].first->removeIncomingValue(BackEdge); + DeadInstrRoots.push_back(ReductionInfo[RootInst].second); } else { assert(R && "Unable to find replacement for RootInstruction"); DeadInstrRoots.push_back(RootInstruction); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cb6ba06bd4425c..d45c3cddd64de4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -29542,9 +29542,16 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) { unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); + + if (Operation == ComplexDeinterleavingOperation::CDot) + return ScalarWidth == 32 || ScalarWidth == 64; return 8 <= ScalarWidth && ScalarWidth <= 64; } + // CDot is not supported outside of scalable/sve scopes + if (Operation == ComplexDeinterleavingOperation::CDot) + return false; + + return 
(ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); } @@ -29554,6 +29561,8 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { VectorType *Ty = cast<VectorType>(InputA->getType()); + if (Accumulator == nullptr) + Accumulator = Constant::getNullValue(Ty); bool IsScalable = Ty->isScalableTy(); bool IsInt = Ty->getElementType()->isIntegerTy(); @@ -29565,6 +29574,10 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( if (TyWidth > 128) { int Stride = Ty->getElementCount().getKnownMinValue() / 2; + int AccStride = cast<VectorType>(Accumulator->getType()) + ->getElementCount() + .getKnownMinValue() / + 2; auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0)); auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0)); @@ -29574,25 +29587,26 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride)); Value *LowerSplitAcc = nullptr; Value *UpperSplitAcc = nullptr; - if (Accumulator) { - LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0)); - UpperSplitAcc = - B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); - } + Type *FullTy = Ty; + FullTy = Accumulator->getType(); + auto *HalfAccTy = VectorType::getHalfElementsVectorType( + cast<VectorType>(Accumulator->getType())); + LowerSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0)); + UpperSplitAcc = + B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride)); auto *LowerSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); - auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, - B.getInt64(0)); - return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride)); + auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), + LowerSplitInt, B.getInt64(0)); + return B.CreateInsertVector(FullTy, Result, UpperSplitInt, + B.getInt64(AccStride)); } if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { - if (Accumulator == nullptr) - Accumulator = Constant::getNullValue(Ty); - if (IsScalable) { if (IsInt) return B.CreateIntrinsic( @@ -29644,6 +29658,13 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( return B.CreateIntrinsic(IntId, Ty, {InputA, InputB}); } + if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt && + IsScalable) { + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_cdot, Accumulator->getType(), + {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)}); + } + return nullptr; } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll new file mode 100644 index 00000000000000..11cf4c31936d8f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll @@ -0,0 +1,1136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2 +; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE +; RUN: opt -S --passes=complex-deinterleaving %s -o - | 
FileCheck %s --check-prefix=CHECK-NOSVE + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @cdotp_i8_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 
@llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; 
CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) 
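+; Note that the cdot immediate encodes the rotation matched by the pass in
+; degrees (Rotation * 90), which is why both calls above carry i32 180.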
+; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; 
CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] 
] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ 
%partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i64 @cdotp_i16_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext 
[[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = 
call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: 
[[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } 
@llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( 
[[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext 
[[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + + +define i32 @not_cdotp( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @not_cdotp( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE2-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, 
[[IMAG_MUL]]
+; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2: [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @not_cdotp(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT: [[ENTRY:.*]]:
+; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE: [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE: [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT: ret i32 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE: [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+define i16 @invalid_type(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i16 @invalid_type(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2: [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2: [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT: ret i16 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i16 @invalid_type(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT: [[ENTRY:.*]]:
+; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE: [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE: [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT: ret i16 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i16 @invalid_type(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE: [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT: ret i16 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 8 x i16> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+  %0 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %partial.reduce.sub)
+  ret i16 %0
+}
+
+define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2: [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+;
CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length( +; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32> +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32> +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP0]] +; +; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length( +; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } 
[[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
+  %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
+  %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
+  %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
+  %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
+  %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
+  %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
+  %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 8 x i32>)
+
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)

From 9daf10ff8f29ba3a88a105aaa9d2379c21b77d35 Mon Sep 17 00:00:00 2001
From: "Oleksandr T."
Date: Wed, 18 Dec 2024 12:36:23 +0200 Subject: [PATCH 09/37] Reland [Clang] skip default argument instantiation for non-defining friend declarations to meet [dcl.fct.default] p4 (#115487) This fixes a crash when instantiating default arguments for templated friend function declarations which lack a definition. There are implementation limits which prevents us from finding the pattern for such functions, and this causes difficulties setting up the instantiation scope for the function parameters. This patch skips instantiating the default argument in these cases, which causes a minor regression in error recovery, but otherwise avoids the crash. The previous attempt #113777 accidentally skipped all default argument constructions, causing some regressions. This patch resolves that by moving the guard to InstantiateDefaultArgument() where the handling of templates takes place. Fixes https://github.com/llvm/llvm-project/issues/113324 --- clang/docs/ReleaseNotes.rst | 2 ++ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 11 ++++++++++ clang/test/CXX/temp/temp.res/p4.cpp | 20 +++++++++++++++++++ clang/test/CodeGenCXX/default-arguments.cpp | 11 ++++++++++ 4 files changed, 44 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 956b5532b48f65..3645dff2e6fe88 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -837,6 +837,8 @@ Bug Fixes to C++ Support missing placeholder return type. (#GH78694) - Fixed a bug where bounds of partially expanded pack indexing expressions were checked too early. (#GH116105) - Fixed an assertion failure caused by using ``consteval`` in condition in consumed analyses. (#GH117385) +- Fixed an assertion failure caused by invalid default argument substitutions in non-defining + friend declarations. (#GH113324) - Fix a crash caused by incorrect argument position in merging deduced template arguments. (#GH113659) - Fixed a parser crash when using pack indexing as a nested name specifier. (#GH119072) - Fixed a null pointer dereference issue when heuristically computing ``sizeof...(pack)`` expressions. (#GH81436) diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index c70ee73a2d8e11..e058afe81da589 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -4703,6 +4703,17 @@ bool Sema::InstantiateDefaultArgument(SourceLocation CallLoc, FunctionDecl *FD, ParmVarDecl *Param) { assert(Param->hasUninstantiatedDefaultArg()); + // FIXME: We don't track member specialization info for non-defining + // friend declarations, so we will not be able to later find the function + // pattern. As a workaround, don't instantiate the default argument in this + // case. This is correct per the standard and only an issue for recovery + // purposes. [dcl.fct.default]p4: + // if a friend declaration D specifies a default argument expression, + // that declaration shall be a definition. + if (FD->getFriendObjectKind() != Decl::FOK_None && + !FD->getTemplateInstantiationPattern()) + return true; + // Instantiate the expression. 
// // FIXME: Pass in a correct Pattern argument, otherwise diff --git a/clang/test/CXX/temp/temp.res/p4.cpp b/clang/test/CXX/temp/temp.res/p4.cpp index f54d8649f5da88..9dbdd235e925d1 100644 --- a/clang/test/CXX/temp/temp.res/p4.cpp +++ b/clang/test/CXX/temp/temp.res/p4.cpp @@ -185,3 +185,23 @@ template struct S { friend void X::f(T::type); }; } + +namespace GH113324 { +template struct S1 { + friend void f1(S1, int = 0); // expected-error {{friend declaration specifying a default argument must be a definition}} + friend void f2(S1 a, S1 = decltype(a){}); // expected-error {{friend declaration specifying a default argument must be a definition}} +}; + +template using alias = int; +template struct S2 { + // FIXME: We miss diagnosing the default argument instantiation failure + // (forming reference to void) + friend void f3(S2, int a = alias(1)); // expected-error {{friend declaration specifying a default argument must be a definition}} +}; + +void test() { + f1(S1<>{}); + f2(S1<>{}); + f3(S2()); +} +} // namespace GH113324 diff --git a/clang/test/CodeGenCXX/default-arguments.cpp b/clang/test/CodeGenCXX/default-arguments.cpp index 215bcd882e9625..2459ef1ad41fcd 100644 --- a/clang/test/CodeGenCXX/default-arguments.cpp +++ b/clang/test/CodeGenCXX/default-arguments.cpp @@ -12,6 +12,17 @@ void g() { } } +namespace GH113324 { +struct S1 { + friend void f(S1, int = 42) {} +}; + +void test() { + S1 s1; + f(s1); +} +}; + struct A1 { A1(); ~A1(); From 414c462a839edbcbed217b8d695e71f2ede7f952 Mon Sep 17 00:00:00 2001 From: Aaditya <115080342+easyonaadit@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:08:17 +0530 Subject: [PATCH 10/37] [AMDGPU] Modify Dyn Alloca test to account for Machine-Verifier bug (#120393) Machine-Verifier crashes in kernel functions, but fails gracefully in device functions. This is due to the buffer resource descriptor selected during G-ISEL, before the fallback path. Device functions use `$sgpr0_sgpr1_sgpr2_sgpr3`. 
Kernel functions, on the other hand, select `$private_rsrc_reg`, and the
machine verifier complains: `$private_rsrc_reg is not a SReg_128 register.`

The test case is modified to capture both behaviors; this is related to
https://github.com/llvm/llvm-project/pull/120063

---
 .../AMDGPU/GlobalISel/dynamic-alloca-divergent.ll  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
index 5dae7885f6bfb1..cfe5d1c194f420 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
@@ -1,25 +1,25 @@
-; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
+; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -verify-machineinstrs=0 -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s

 ; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4)
 ; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
 ; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (ptr addrspace(1)): unsupported dynamic alloca

-; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
-; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
-; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
-
 define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1) %ptr) {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
   %n = load i32, ptr addrspace(1) %gep
   %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
+  store volatile i32 123, ptr addrspace(5) %alloca
   ret void
 }

+; ERR: remark: <unknown>:0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
+; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
+; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
+
 define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
   %alloca = alloca i32, i32 %n, align 4, addrspace(5)
-  store volatile ptr addrspace(5) %alloca, ptr addrspace(1) undef
+  store volatile i32 456, ptr addrspace(5) %alloca
   ret void
 }

From 222dd235ffc39b3695a3c002593097bec216a8fa Mon Sep 17 00:00:00 2001
From: Congcong Cai
Date: Wed, 18 Dec 2024 18:38:46 +0800
Subject: [PATCH 11/37] [clang-tidy] use local config (#120004)

Follow-up patch for #119948.
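As an illustrative sketch (the keys below are assumptions mirroring the
option spellings used in the tests updated by this patch), a `.clang-tidy`
configuration that previously relied on the bare global option now needs
the check-qualified form:

    # Before: a bare key was picked up via getLocalOrGlobal by
    # every check that read StrictMode.
    CheckOptions:
      StrictMode: true

    # After: the option must be scoped to a specific check, e.g.:
    CheckOptions:
      modernize-use-std-print.StrictMode: true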
--- .../clang-tidy/misc/IncludeCleanerCheck.cpp | 7 +++---- ...InconsistentDeclarationParameterNameCheck.h | 2 +- clang-tools-extra/docs/ReleaseNotes.rst | 18 ++++++++++++++++++ .../bugprone/argument-comment-strict.cpp | 2 +- .../cppcoreguidelines/pro-type-const-cast.cpp | 2 +- .../pro-type-static-cast-downcast.cpp | 2 +- .../checkers/misc/unused-parameters-strict.cpp | 2 +- .../checkers/modernize/use-std-format.cpp | 4 ++-- .../checkers/modernize/use-std-print-absl.cpp | 4 ++-- .../checkers/modernize/use-std-print.cpp | 4 ++-- .../clang-tidy/IncludeCleanerTest.cpp | 14 ++++++++------ 11 files changed, 40 insertions(+), 21 deletions(-) diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp index 5e7a0e65690b7a..7638bbc103d16d 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.cpp @@ -57,10 +57,9 @@ struct MissingIncludeInfo { IncludeCleanerCheck::IncludeCleanerCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IgnoreHeaders(utils::options::parseStringList( - Options.getLocalOrGlobal("IgnoreHeaders", ""))), - DeduplicateFindings( - Options.getLocalOrGlobal("DeduplicateFindings", true)) { + IgnoreHeaders( + utils::options::parseStringList(Options.get("IgnoreHeaders", ""))), + DeduplicateFindings(Options.get("DeduplicateFindings", true)) { for (const auto &Header : IgnoreHeaders) { if (!llvm::Regex{Header}.isValid()) configurationDiag("Invalid ignore headers regex '%0'") << Header; diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h index 1c526577b403f6..0c5ead860c161a 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h @@ -26,7 +26,7 @@ class InconsistentDeclarationParameterNameCheck : public ClangTidyCheck { ClangTidyContext *Context) : ClangTidyCheck(Name, Context), IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)), - Strict(Options.getLocalOrGlobal("Strict", false)) {} + Strict(Options.get("Strict", false)) {} void storeOptions(ClangTidyOptions::OptionMap &Opts) override; void registerMatchers(ast_matchers::MatchFinder *Finder) override; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6803842106791b..3fd7a4f9da18ad 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -115,6 +115,24 @@ Improvements to clang-tidy - Improved :program:`run-clang-tidy.py` script. Fixed minor shutdown noise happening on certain platforms when interrupting the script. +- Removed :program:`clang-tidy`'s global options for most of checks. All options + are changed to local options except `IncludeStyle`, `StrictMode` and + `IgnoreMacros`. + +.. 
csv-table::
+   :header: "Check", "Options removed from global option"
+
+   :doc:`bugprone-reserved-identifier <clang-tidy/checks/bugprone/reserved-identifier>`, AggressiveDependentMemberLookup
+   :doc:`bugprone-unchecked-optional-access <clang-tidy/checks/bugprone/unchecked-optional-access>`, IgnoreSmartPointerDereference
+   :doc:`cppcoreguidelines-pro-type-member-init <clang-tidy/checks/cppcoreguidelines/pro-type-member-init>`, UseAssignment
+   :doc:`cppcoreguidelines-rvalue-reference-param-not-moved <clang-tidy/checks/cppcoreguidelines/rvalue-reference-param-not-moved>`, AllowPartialMove; IgnoreUnnamedParams; IgnoreNonDeducedTemplateTypes
+   :doc:`misc-include-cleaner <clang-tidy/checks/misc/include-cleaner>`, IgnoreHeaders; DeduplicateFindings
+   :doc:`performance-inefficient-vector-operation <clang-tidy/checks/performance/inefficient-vector-operation>`, EnableProto
+   :doc:`readability-identifier-naming <clang-tidy/checks/readability/identifier-naming>`, AggressiveDependentMemberLookup
+   :doc:`readability-inconsistent-declaration-parameter-name <clang-tidy/checks/readability/inconsistent-declaration-parameter-name>`, Strict
+   :doc:`readability-redundant-access-specifiers <clang-tidy/checks/readability/redundant-access-specifiers>`, CheckFirstDeclaration
+   :doc:`readability-redundant-casting <clang-tidy/checks/readability/redundant-casting>`, IgnoreTypeAliases
+
 New checks
 ^^^^^^^^^^

diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-strict.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-strict.cpp
index c25d25ac5738fb..38d91f39846478 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-strict.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/argument-comment-strict.cpp
@@ -1,5 +1,5 @@
 // RUN: %check_clang_tidy %s bugprone-argument-comment %t -- \
-// RUN: -config="{CheckOptions: {StrictMode: true}}" --
+// RUN: -config="{CheckOptions: {bugprone-argument-comment.StrictMode: true}}" --

 void f(int _with_underscores_);
 void g(int x_);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp
index be70e3ba356991..a775334260e35c 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy -check-suffix=STRICT %s cppcoreguidelines-pro-type-const-cast %t -- -config="{CheckOptions: {StrictMode: true}}"
+// RUN: %check_clang_tidy -check-suffix=STRICT %s cppcoreguidelines-pro-type-const-cast %t -- -config="{CheckOptions: {cppcoreguidelines-pro-type-const-cast.StrictMode: true}}"
 // RUN: %check_clang_tidy -check-suffix=NSTRICT %s cppcoreguidelines-pro-type-const-cast %t

 namespace Const {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-static-cast-downcast.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-static-cast-downcast.cpp
index 11179b7d2d19b8..a3c73a960974ba 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-static-cast-downcast.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-static-cast-downcast.cpp
@@ -1,5 +1,5 @@
 // RUN: %check_clang_tidy -check-suffixes=NSTRICT,STRICT %s cppcoreguidelines-pro-type-static-cast-downcast %t
-// RUN: %check_clang_tidy -check-suffix=NSTRICT %s cppcoreguidelines-pro-type-static-cast-downcast %t -- -config="{CheckOptions: {StrictMode: false}}"
+// RUN: %check_clang_tidy -check-suffix=NSTRICT %s cppcoreguidelines-pro-type-static-cast-downcast %t -- -config="{CheckOptions: {cppcoreguidelines-pro-type-static-cast-downcast.StrictMode: false}}"

 class Base {
 };
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters-strict.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters-strict.cpp
index f8385c1a17e7bb..319cefa1c68f10 100644
---
a/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters-strict.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters-strict.cpp @@ -1,5 +1,5 @@ // RUN: %check_clang_tidy %s misc-unused-parameters %t -- \ -// RUN: -config="{CheckOptions: {StrictMode: true}}" -- +// RUN: -config="{CheckOptions: {misc-unused-parameters.StrictMode: true}}" -- // Warn on empty function bodies in StrictMode. namespace strict_mode { diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp index 42fb3382e4a936..0a5a63eba2596a 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp @@ -1,12 +1,12 @@ // RUN: %check_clang_tidy \ // RUN: -std=c++20 %s modernize-use-std-format %t -- \ -// RUN: -config="{CheckOptions: {StrictMode: true}}" \ +// RUN: -config="{CheckOptions: {modernize-use-std-format.StrictMode: true}}" \ // RUN: -- -isystem %clang_tidy_headers \ // RUN: -DPRI_CMDLINE_MACRO="\"s\"" \ // RUN: -D__PRI_CMDLINE_MACRO="\"s\"" // RUN: %check_clang_tidy \ // RUN: -std=c++20 %s modernize-use-std-format %t -- \ -// RUN: -config="{CheckOptions: {StrictMode: false}}" \ +// RUN: -config="{CheckOptions: {modernize-use-std-format.StrictMode: false}}" \ // RUN: -- -isystem %clang_tidy_headers \ // RUN: -DPRI_CMDLINE_MACRO="\"s\"" \ // RUN: -D__PRI_CMDLINE_MACRO="\"s\"" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-absl.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-absl.cpp index 95c32837e4447b..83fbd2e7500c5b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-absl.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print-absl.cpp @@ -1,10 +1,10 @@ // RUN: %check_clang_tidy \ // RUN: -std=c++23 %s modernize-use-std-print %t -- \ -// RUN: -config="{CheckOptions: {StrictMode: true}}" \ +// RUN: -config="{CheckOptions: {modernize-use-std-print.StrictMode: true}}" \ // RUN: -- -isystem %clang_tidy_headers // RUN: %check_clang_tidy \ // RUN: -std=c++23 %s modernize-use-std-print %t -- \ -// RUN: -config="{CheckOptions: {StrictMode: false}}" \ +// RUN: -config="{CheckOptions: {modernize-use-std-print.StrictMode: false}}" \ // RUN: -- -isystem %clang_tidy_headers #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp index f11fc408fcb9c8..5da995d9d6e830 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp @@ -1,12 +1,12 @@ // RUN: %check_clang_tidy -check-suffixes=,STRICT \ // RUN: -std=c++23 %s modernize-use-std-print %t -- \ -// RUN: -config="{CheckOptions: {StrictMode: true}}" \ +// RUN: -config="{CheckOptions: {modernize-use-std-print.StrictMode: true}}" \ // RUN: -- -isystem %clang_tidy_headers -fexceptions \ // RUN: -DPRI_CMDLINE_MACRO="\"s\"" \ // RUN: -D__PRI_CMDLINE_MACRO="\"s\"" // RUN: %check_clang_tidy -check-suffixes=,NOTSTRICT \ // RUN: -std=c++23 %s modernize-use-std-print %t -- \ -// RUN: -config="{CheckOptions: {StrictMode: false}}" \ +// RUN: -config="{CheckOptions: {modernize-use-std-print.StrictMode: false}}" \ // RUN: -- -isystem %clang_tidy_headers -fexceptions \ // RUN: -DPRI_CMDLINE_MACRO="\"s\"" \ // RUN: -D__PRI_CMDLINE_MACRO="\"s\"" diff 
--git a/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp b/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp index d400cf6fe2d576..3d6ec995e443d4 100644 --- a/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/IncludeCleanerTest.cpp @@ -71,10 +71,12 @@ TEST(IncludeCleanerCheckTest, SuppressUnusedIncludes) { std::vector Errors; ClangTidyOptions Opts; - Opts.CheckOptions["IgnoreHeaders"] = llvm::StringRef{llvm::formatv( - "bar.h;{0};{1};vector;;", - llvm::Regex::escape(appendPathFileSystemIndependent({"foo", "qux.h"})), - llvm::Regex::escape(appendPathFileSystemIndependent({"baz", "qux"})))}; + Opts.CheckOptions["test-check-0.IgnoreHeaders"] = llvm::StringRef{ + llvm::formatv("bar.h;{0};{1};vector;;", + llvm::Regex::escape( + appendPathFileSystemIndependent({"foo", "qux.h"})), + llvm::Regex::escape( + appendPathFileSystemIndependent({"baz", "qux"})))}; EXPECT_EQ( PostCode, runCheckOnCode( @@ -139,7 +141,7 @@ int BarResult2 = $diag2^bar();)"); { std::vector Errors; ClangTidyOptions Opts; - Opts.CheckOptions.insert({"DeduplicateFindings", "false"}); + Opts.CheckOptions["test-check-0.DeduplicateFindings"] = "false"; runCheckOnCode(Code.code(), &Errors, "file.cpp", {}, Opts, {{"baz.h", R"(#pragma once @@ -170,7 +172,7 @@ std::vector x; )"; ClangTidyOptions Opts; - Opts.CheckOptions["IgnoreHeaders"] = llvm::StringRef{ + Opts.CheckOptions["test-check-0.IgnoreHeaders"] = llvm::StringRef{ "public.h;;baz.h;" + llvm::Regex::escape(appendPathFileSystemIndependent({"foo", "qux.h"}))}; std::vector Errors; From 41c1992a16997229469aa08bc195919e96d18211 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 18 Dec 2024 11:41:44 +0100 Subject: [PATCH 12/37] [NVPTX] fix nvcl-param-align.ll fix for f9c8c01d38f8fbea81db99ab90b7d0f2bdcc8b4d --- llvm/test/CodeGen/NVPTX/nvcl-param-align.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll index acf72ef09d3aa7..48162eaba257de 100644 --- a/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/nvcl-param-align.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_20 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_20 | %ptxas-verify %} +; RUN: llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-nvidia-nvcl -mcpu=sm_60 | %ptxas-verify %} target triple = "nvptx-unknown-nvcl" From 7384d8bc18535286a24b4422f6661109d127e8fd Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Wed, 18 Dec 2024 19:54:00 +0900 Subject: [PATCH 13/37] SourceCoverageViewHTML.cpp: Reformat JS --- .../tools/llvm-cov/SourceCoverageViewHTML.cpp | 55 +++++++++---------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp index 0175deb1c848dc..1ca1c1d86bda06 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp @@ -90,40 +90,38 @@ const char *BeginHeader = const char *JSForCoverage = R"javascript( - function next_uncovered(selector, reverse, scroll_selector) { function visit_element(element) { element.classList.add("seen"); element.classList.add("selected"); - - if (!scroll_selector) { - scroll_selector = "tr:has(.selected) td.line-number" - } - - const scroll_to = document.querySelector(scroll_selector); - if 
(scroll_to) { - scroll_to.scrollIntoView({behavior: "smooth", block: "center", inline: "end"}); - } - + + if (!scroll_selector) { + scroll_selector = "tr:has(.selected) td.line-number" + } + + const scroll_to = document.querySelector(scroll_selector); + if (scroll_to) { + scroll_to.scrollIntoView({behavior: "smooth", block: "center", inline: "end"}); + } } - + function select_one() { if (!reverse) { const previously_selected = document.querySelector(".selected"); - + if (previously_selected) { previously_selected.classList.remove("selected"); } - + return document.querySelector(selector + ":not(.seen)"); - } else { + } else { const previously_selected = document.querySelector(".selected"); - + if (previously_selected) { previously_selected.classList.remove("selected"); previously_selected.classList.remove("seen"); } - + const nodes = document.querySelectorAll(selector + ".seen"); if (nodes) { const last = nodes[nodes.length - 1]; // last @@ -133,54 +131,52 @@ function next_uncovered(selector, reverse, scroll_selector) { } } } - + function reset_all() { if (!reverse) { const all_seen = document.querySelectorAll(selector + ".seen"); - + if (all_seen) { all_seen.forEach(e => e.classList.remove("seen")); } } else { const all_seen = document.querySelectorAll(selector + ":not(.seen)"); - + if (all_seen) { all_seen.forEach(e => e.classList.add("seen")); } } - + } - + const uncovered = select_one(); if (uncovered) { visit_element(uncovered); } else { reset_all(); - - + const uncovered = select_one(); - + if (uncovered) { visit_element(uncovered); } } } -function next_line(reverse) { +function next_line(reverse) { next_uncovered("td.uncovered-line", reverse) } -function next_region(reverse) { +function next_region(reverse) { next_uncovered("span.red.region", reverse); } -function next_branch(reverse) { +function next_branch(reverse) { next_uncovered("span.red.branch", reverse); } document.addEventListener("keypress", function(event) { - console.log(event); const reverse = event.shiftKey; if (event.code == "KeyL") { next_line(reverse); @@ -191,7 +187,6 @@ document.addEventListener("keypress", function(event) { if (event.code == "KeyR") { next_region(reverse); } - }); )javascript"; From 5a5838fba37153adb7885c897131dda09227eb2d Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Wed, 18 Dec 2024 19:53:10 +0900 Subject: [PATCH 14/37] Introduce CounterMappingRegion::isBranch(). NFC. 
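
A minimal sketch of the intent, mirroring the call sites updated in the
diff below: the repeated two-kind comparison

    if (R.Kind == CounterMappingRegion::BranchRegion ||
        R.Kind == CounterMappingRegion::MCDCBranchRegion)

collapses into

    if (R.isBranch())

with no functional change.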
--- clang/lib/CodeGen/CoverageMappingGen.cpp | 3 +-- llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h | 7 +++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 96c89b2728e5b7..7248abe480cba8 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -2370,8 +2370,7 @@ static void dump(llvm::raw_ostream &OS, StringRef FunctionName, } else { Ctx.dump(R.Count, OS); - if (R.Kind == CounterMappingRegion::BranchRegion || - R.Kind == CounterMappingRegion::MCDCBranchRegion) { + if (R.isBranch()) { OS << ", "; Ctx.dump(R.FalseCount, OS); } diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 590185d42e72a6..42da188fef34ee 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -272,6 +272,10 @@ struct CounterMappingRegion { RegionKind Kind; + bool isBranch() const { + return (Kind == BranchRegion || Kind == MCDCBranchRegion); + } + CounterMappingRegion(Counter Count, unsigned FileID, unsigned ExpandedFileID, unsigned LineStart, unsigned ColumnStart, unsigned LineEnd, unsigned ColumnEnd, RegionKind Kind) @@ -716,8 +720,7 @@ struct FunctionRecord { void pushRegion(CounterMappingRegion Region, uint64_t Count, uint64_t FalseCount, bool HasSingleByteCoverage) { - if (Region.Kind == CounterMappingRegion::BranchRegion || - Region.Kind == CounterMappingRegion::MCDCBranchRegion) { + if (Region.isBranch()) { CountedBranchRegions.emplace_back(Region, Count, FalseCount, HasSingleByteCoverage); // If either counter is hard-coded to zero, then this region represents a From a9df1f6cb0dcdd808abc25f7fa1555e9e0ec6a9f Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Wed, 18 Dec 2024 19:53:33 +0900 Subject: [PATCH 15/37] llvm-cov: Refactor SourceCoverageView::renderBranchView(). NFC except for calculating `Total`. I've replaced `(uint64_t)+(uint64_t)` with `(double)+(double)`. This is still inexact with large numbers `(1LL << 53)` but will be expected to prevent possible overflow. --- .../tools/llvm-cov/SourceCoverageViewHTML.cpp | 70 +++++++------------ .../tools/llvm-cov/SourceCoverageViewText.cpp | 65 ++++++++--------- 2 files changed, 53 insertions(+), 82 deletions(-) diff --git a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp index 1ca1c1d86bda06..e2be576b93cdaf 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewHTML.cpp @@ -1096,20 +1096,31 @@ void SourceCoverageViewHTML::renderBranchView(raw_ostream &OS, BranchView &BRV, if (getOptions().Debug) errs() << "Branch at line " << BRV.getLine() << '\n'; + auto BranchCount = [&](StringRef Label, uint64_t Count, bool Folded, + double Total) { + if (Folded) + return std::string{"Folded"}; + + std::string Str; + raw_string_ostream OS(Str); + + OS << tag("span", Label, (Count ? "None" : "red branch")) << ": "; + if (getOptions().ShowBranchCounts) + OS << tag("span", formatCount(Count), + (Count ? "covered-line" : "uncovered-line")); + else + OS << format("%0.2f", (Total != 0 ? 100.0 * Count / Total : 0.0)) << "%"; + + return Str; + }; + OS << BeginExpansionDiv; OS << BeginPre; for (const auto &R : BRV.Regions) { - // Calculate TruePercent and False Percent. 
- double TruePercent = 0.0; - double FalsePercent = 0.0; - // FIXME: It may overflow when the data is too large, but I have not - // encountered it in actual use, and not sure whether to use __uint128_t. - uint64_t Total = R.ExecutionCount + R.FalseExecutionCount; - - if (!getOptions().ShowBranchCounts && Total != 0) { - TruePercent = ((double)(R.ExecutionCount) / (double)Total) * 100.0; - FalsePercent = ((double)(R.FalseExecutionCount) / (double)Total) * 100.0; - } + // This can be `double` since it is only used as a denominator. + // FIXME: It is still inaccurate if Count is greater than (1LL << 53). + double Total = + static_cast(R.ExecutionCount) + R.FalseExecutionCount; // Display Line + Column. std::string LineNoStr = utostr(uint64_t(R.LineStart)); @@ -1128,40 +1139,9 @@ void SourceCoverageViewHTML::renderBranchView(raw_ostream &OS, BranchView &BRV, continue; } - // Display TrueCount or TruePercent. - std::string TrueColor = - (R.TrueFolded || R.ExecutionCount ? "None" : "red branch"); - std::string TrueCovClass = - (R.TrueFolded || R.ExecutionCount > 0 ? "covered-line" - : "uncovered-line"); - - if (R.TrueFolded) - OS << "Folded, "; - else { - OS << tag("span", "True", TrueColor) << ": "; - if (getOptions().ShowBranchCounts) - OS << tag("span", formatCount(R.ExecutionCount), TrueCovClass) << ", "; - else - OS << format("%0.2f", TruePercent) << "%, "; - } - - // Display FalseCount or FalsePercent. - std::string FalseColor = - (R.FalseFolded || R.FalseExecutionCount ? "None" : "red branch"); - std::string FalseCovClass = - (R.FalseFolded || R.FalseExecutionCount > 0 ? "covered-line" - : "uncovered-line"); - - if (R.FalseFolded) - OS << "Folded]\n"; - else { - OS << tag("span", "False", FalseColor) << ": "; - if (getOptions().ShowBranchCounts) - OS << tag("span", formatCount(R.FalseExecutionCount), FalseCovClass) - << "]\n"; - else - OS << format("%0.2f", FalsePercent) << "%]\n"; - } + OS << BranchCount("True", R.ExecutionCount, R.TrueFolded, Total) << ", " + << BranchCount("False", R.FalseExecutionCount, R.FalseFolded, Total) + << "]\n"; } OS << EndPre; OS << EndExpansionDiv; diff --git a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp index 444f33dac10837..63f8248e3387ba 100644 --- a/llvm/tools/llvm-cov/SourceCoverageViewText.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageViewText.cpp @@ -294,17 +294,32 @@ void SourceCoverageViewText::renderBranchView(raw_ostream &OS, BranchView &BRV, if (getOptions().Debug) errs() << "Branch at line " << BRV.getLine() << '\n'; + auto BranchCount = [&](StringRef Label, uint64_t Count, bool Folded, + double Total) { + if (Folded) + return std::string{"Folded"}; + + std::string Str; + raw_string_ostream OS(Str); + + colored_ostream(OS, raw_ostream::RED, getOptions().Colors && !Count, + /*Bold=*/false, /*BG=*/true) + << Label; + + if (getOptions().ShowBranchCounts) + OS << ": " << formatCount(Count); + else + OS << ": " << format("%0.2f", (Total != 0 ? 100.0 * Count / Total : 0.0)) + << "%"; + + return Str; + }; + for (const auto &R : BRV.Regions) { - double TruePercent = 0.0; - double FalsePercent = 0.0; - // FIXME: It may overflow when the data is too large, but I have not - // encountered it in actual use, and not sure whether to use __uint128_t. 
- uint64_t Total = R.ExecutionCount + R.FalseExecutionCount; - - if (!getOptions().ShowBranchCounts && Total != 0) { - TruePercent = ((double)(R.ExecutionCount) / (double)Total) * 100.0; - FalsePercent = ((double)(R.FalseExecutionCount) / (double)Total) * 100.0; - } + // This can be `double` since it is only used as a denominator. + // FIXME: It is still inaccurate if Count is greater than (1LL << 53). + double Total = + static_cast(R.ExecutionCount) + R.FalseExecutionCount; renderLinePrefix(OS, ViewDepth); OS << " Branch (" << R.LineStart << ":" << R.ColumnStart << "): ["; @@ -314,33 +329,9 @@ void SourceCoverageViewText::renderBranchView(raw_ostream &OS, BranchView &BRV, continue; } - if (R.TrueFolded) - OS << "Folded, "; - else { - colored_ostream(OS, raw_ostream::RED, - getOptions().Colors && !R.ExecutionCount, - /*Bold=*/false, /*BG=*/true) - << "True"; - - if (getOptions().ShowBranchCounts) - OS << ": " << formatCount(R.ExecutionCount) << ", "; - else - OS << ": " << format("%0.2f", TruePercent) << "%, "; - } - - if (R.FalseFolded) - OS << "Folded]\n"; - else { - colored_ostream(OS, raw_ostream::RED, - getOptions().Colors && !R.FalseExecutionCount, - /*Bold=*/false, /*BG=*/true) - << "False"; - - if (getOptions().ShowBranchCounts) - OS << ": " << formatCount(R.FalseExecutionCount) << "]\n"; - else - OS << ": " << format("%0.2f", FalsePercent) << "%]\n"; - } + OS << BranchCount("True", R.ExecutionCount, R.TrueFolded, Total) << ", " + << BranchCount("False", R.FalseExecutionCount, R.FalseFolded, Total) + << "]\n"; } } From 95eb49a0905568a13c840b7866ce5d9c47e022f0 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 18 Dec 2024 11:22:39 +0000 Subject: [PATCH 16/37] [SCEV] Bail out on mixed int/pointer in SCEVWrapPredicate::implies. Fixes a crash when trying to extend the pointer start value to a narrow integer type after b6c29fdffd65. --- llvm/lib/Analysis/ScalarEvolution.cpp | 7 +- .../LoopAccessAnalysis/nusw-predicates.ll | 120 ++++++++++++++++++ 2 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index c820e8bf7266ad..d55d09020fc147 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -14978,6 +14978,11 @@ bool SCEVWrapPredicate::implies(const SCEVPredicate *N, Flags != SCEVWrapPredicate::IncrementNUSW) return false; + const SCEV *Start = AR->getStart(); + const SCEV *OpStart = Op->AR->getStart(); + if (Start->getType()->isPointerTy() != OpStart->getType()->isPointerTy()) + return false; + const SCEV *Step = AR->getStepRecurrence(SE); const SCEV *OpStep = Op->AR->getStepRecurrence(SE); if (!SE.isKnownPositive(Step) || !SE.isKnownPositive(OpStep)) @@ -14990,8 +14995,6 @@ bool SCEVWrapPredicate::implies(const SCEVPredicate *N, OpStep = SE.getNoopOrZeroExtend(OpStep, WiderTy); bool IsNUW = Flags == SCEVWrapPredicate::IncrementNUSW; - const SCEV *OpStart = Op->AR->getStart(); - const SCEV *Start = AR->getStart(); OpStart = IsNUW ? SE.getNoopOrZeroExtend(OpStart, WiderTy) : SE.getNoopOrSignExtend(OpStart, WiderTy); Start = IsNUW ? 
SE.getNoopOrZeroExtend(Start, WiderTy) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll b/llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll new file mode 100644 index 00000000000000..5234d8f107271a --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "p:16:16" + +define void @int_and_pointer_predicate(ptr %v, i32 %N) { +; CHECK-LABEL: 'int_and_pointer_predicate' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: store i16 0, ptr %gep.iv.i16, align 1 -> +; CHECK-NEXT: store i16 0, ptr %v, align 1 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-NEXT: {%v,+,4}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.i16 = trunc i64 %iv to i16 + %gep.iv.i16 = getelementptr { i16, i16 }, ptr %v, i16 %iv.i16 + store i16 0, ptr %gep.iv.i16, align 1 + store i16 0, ptr %v, align 1 + %iv.next = add i64 %iv, 1 + %iv.i32 = trunc i64 %iv to i32 + %.not = icmp ult i32 %N, %iv.i32 + br i1 %.not, label %exit, label %loop + +exit: + ret void +} + +define void @int_and_multiple_pointer_predicates(ptr %v, ptr %w, i32 %N) { +; CHECK-LABEL: 'int_and_multiple_pointer_predicates' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: store i16 0, ptr %gep.v, align 1 -> +; CHECK-NEXT: store i16 0, ptr %v, align 1 +; CHECK-EMPTY: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: store i16 0, ptr %gep.w, align 1 -> +; CHECK-NEXT: store i16 0, ptr %w, align 1 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %v +; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %w +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[GRP1]]): +; CHECK-NEXT: ptr %v +; CHECK-NEXT: Against group ([[GRP3:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.w = getelementptr { i16, i16 }, ptr %w, i16 %iv.i16 +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GRP4:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.v = getelementptr { i16, i16 }, ptr %v, i16 %iv.i16 +; CHECK-NEXT: Against group ([[GRP2]]): +; CHECK-NEXT: ptr %w +; CHECK-NEXT: Check 3: +; CHECK-NEXT: Comparing group ([[GRP4]]): +; CHECK-NEXT: %gep.v = getelementptr { i16, i16 }, ptr %v, i16 %iv.i16 +; CHECK-NEXT: Against group ([[GRP3]]): +; CHECK-NEXT: %gep.w = getelementptr { i16, i16 }, ptr %w, i16 %iv.i16 +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP1]]: +; CHECK-NEXT: (Low: %v High: (2 + %v)) +; CHECK-NEXT: Member: %v +; CHECK-NEXT: Group [[GRP4]]: +; CHECK-NEXT: (Low: %v High: (6 + (4 * (trunc i32 %N to i16)) + %v)) +; CHECK-NEXT: Member: {%v,+,4}<%loop> +; CHECK-NEXT: Group [[GRP2]]: +; CHECK-NEXT: (Low: %w High: (2 + %w)) +; CHECK-NEXT: Member: %w +; CHECK-NEXT: Group [[GRP3]]: +; CHECK-NEXT: (Low: %w High: (6 + (4 * (trunc i32 %N to i16)) + %w)) +; CHECK-NEXT: Member: {%w,+,4}<%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-NEXT: {%v,+,4}<%loop> Added Flags: +; CHECK-NEXT: {%w,+,4}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %iv.i16 = trunc i64 %iv to i16 + %gep.v = getelementptr { i16, i16 }, ptr %v, i16 %iv.i16 + store i16 0, ptr %gep.v, align 1 + store i16 0, ptr %v, align 1 + %gep.w = getelementptr { i16, i16 }, ptr %w, i16 %iv.i16 + store i16 0, ptr %gep.w, align 1 + store i16 0, ptr %w, align 1 + %iv.next = add i64 %iv, 1 + %iv.i32 = trunc i64 %iv to i32 + %.not = icmp ult i32 %N, %iv.i32 + br i1 %.not, label %exit, label %loop + +exit: + ret void +} From 9826201093f047164733982492e25151b28404df Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Wed, 18 Dec 2024 12:36:22 +0100 Subject: [PATCH 17/37] LLVMContext: rem constexpr to unblock build w/ gcc (#120402) Address issues observed in buildbots with older GCC versions: https://lab.llvm.org/buildbot/#/builders/140/builds/13302 --- llvm/lib/IR/LLVMContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp index 9acc15f11316a2..447e5d92e0b99d 100644 --- a/llvm/lib/IR/LLVMContext.cpp +++ b/llvm/lib/IR/LLVMContext.cpp @@ -31,7 +31,7 @@ using namespace llvm; -static constexpr StringRef knownBundleName(unsigned BundleTagID) { +static StringRef knownBundleName(unsigned BundleTagID) { switch (BundleTagID) { case LLVMContext::OB_deopt: return "deopt"; From dd8e1adbf22f9b84e9fc5ed65530df55a3c3b693 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 18 Dec 2024 11:36:54 +0000 Subject: [PATCH 18/37] [X86] LowerShift - track the number and location of constant shift elements. (#120270) We have several vector shift lowering strategies that have to analyse the distribution of non-uniform constant vector shift amounts, at the moment there is very little sharing of data between these analysis. This patch creates a SmallDenseMap of the different LEGAL constant shift amounts used, with a mask of which elements they are used in. So far I've only updated the shuffle(immshift(x,c1),immshift(x,c2)) lowering pattern to use it for clarity, there's several more that can be done in followups. Its hoped that the proposed patch #117980 can be simplified after this patch as well. vec_shift6.ll - the existing shuffle(immshift(x,c1),immshift(x,c2)) lowering bails on out of range shift amounts, while this patch now skips them and treats them as UNDEF - this means we manage to fold more cases that before would have to lower to a SHL->MUL pattern, including some legalized cases. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 69 +++++++++++---------- llvm/test/CodeGen/X86/vec_shift6.ll | 37 +++++++---- llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 4 +- llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 49 +++++++-------- 4 files changed, 85 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2479bc3fd8f080..ad5e2e7ea83bbb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30057,6 +30057,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } + // Build a map of inrange constant amounts with element mask where they occur. 
+  SmallDenseMap<unsigned, APInt> UniqueCstAmt;
+  if (ConstantAmt) {
+    for (unsigned I = 0; I != NumElts; ++I) {
+      SDValue A = Amt.getOperand(I);
+      if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
+        continue;
+      unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
+      if (UniqueCstAmt.count(CstAmt)) {
+        UniqueCstAmt[CstAmt].setBit(I);
+        continue;
+      }
+      UniqueCstAmt[CstAmt] = APInt::getOneBitSet(NumElts, I);
+    }
+    assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
+  }
+
   // If possible, lower this shift as a sequence of two shifts by
   // constant plus a BLENDing shuffle instead of scalarizing it.
   // Example:
@@ -30067,45 +30084,31 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
   //
   // The advantage is that the two shifts from the example would be
   // lowered as X86ISD::VSRLI nodes in parallel before blending.
-  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
-                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
-    SDValue Amt1, Amt2;
-    SmallVector<int, 8> ShuffleMask;
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue A = Amt->getOperand(i);
-      if (A.isUndef()) {
-        ShuffleMask.push_back(SM_SentinelUndef);
-        continue;
-      }
-      if (!Amt1 || Amt1 == A) {
-        ShuffleMask.push_back(i);
-        Amt1 = A;
-        continue;
-      }
-      if (!Amt2 || Amt2 == A) {
-        ShuffleMask.push_back(i + NumElts);
-        Amt2 = A;
-        continue;
-      }
-      break;
+  if (UniqueCstAmt.size() == 2 &&
+      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
+       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
+    unsigned AmtA = UniqueCstAmt.begin()->first;
+    unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
+    const APInt &MaskA = UniqueCstAmt.begin()->second;
+    const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
+    SmallVector<int, 16> ShuffleMask(NumElts, SM_SentinelUndef);
+    for (unsigned I = 0; I != NumElts; ++I) {
+      if (MaskA[I])
+        ShuffleMask[I] = I;
+      if (MaskB[I])
+        ShuffleMask[I] = I + NumElts;
     }

     // Only perform this blend if we can perform it without loading a mask.
-    if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
-        (VT != MVT::v16i16 ||
+    if ((VT != MVT::v16i16 ||
          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
          canWidenShuffleElements(ShuffleMask))) {
-      auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
-      auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
-      if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
-          Cst2->getAPIntValue().ult(EltSizeInBits)) {
-        SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst1->getZExtValue(), DAG);
-        SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst2->getZExtValue(), DAG);
-        return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
-      }
+      SDValue Shift1 =
+          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtA, DAG);
+      SDValue Shift2 =
+          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtB, DAG);
+      return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index 59bc3940fcb31e..48ed39e5da88f2 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -22,15 +22,27 @@ define <8 x i16> @test1(<8 x i16> %a) {
   ret <8 x i16> %shl
 }
 
+; Only two legal shift amounts, so we can lower to shuffle(psllw(),psllw())
+
 define <8 x i16> @test2(<8 x i16> %a) {
-; SSE-LABEL: test2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,1,1,2,u,u,2]
-; SSE-NEXT:    retq
+; SSE2-LABEL: test2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psllw $1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test2:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psllw $1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,1,1,2,u,u,2]
+; AVX-NEXT:    vpsllw $1, %xmm0, %xmm1
+; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT:    retq
   %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 undef, i16 1>
   ret <8 x i16> %shl
 }
@@ -43,17 +55,18 @@ define <8 x i16> @test2(<8 x i16> %a) {
 define <4 x i32> @test3(<4 x i32> %a) {
 ; SSE2-LABEL: test3:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pslld $1, %xmm1
+; SSE2-NEXT:    pslld $2, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test3:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pslld $2, %xmm1
+; SSE41-NEXT:    pslld $1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test3:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index d8e45ed9151d87..eb4d84b8d7dd62 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -337,7 +337,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrld $27, %xmm2
 ; SSE41-NEXT:    psrld $28, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -346,7 +346,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2 ; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index a6067a960fc0d6..58dc17988b6469 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -379,16 +379,11 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE2-NEXT: psrld $4, %xmm3 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3] -; SSE2-NEXT: movl $268435456, %eax # imm = 0x10000000 -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pslld $28, %xmm0 +; SSE2-NEXT: pslld $27, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v2i32: @@ -400,7 +395,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; SSE41-NEXT: psrld $4, %xmm3 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pslld $27, %xmm1 +; SSE41-NEXT: pslld $28, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: por %xmm3, %xmm0 ; SSE41-NEXT: retq ; @@ -411,7 +409,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX1-NEXT: vpsrld $4, %xmm1, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpslld $27, %xmm0, %xmm2 +; AVX1-NEXT: vpslld $28, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -482,22 +482,17 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; ; X86-SSE2-LABEL: constant_funnnel_v2i32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: psrld $5, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: psrld $4, %xmm2 -; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3] -; X86-SSE2-NEXT: movl $268435456, %eax # imm = 0x10000000 -; X86-SSE2-NEXT: movd 
%eax, %xmm1
-; X86-SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    por %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $5, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    psrld $4, %xmm3
+; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X86-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pslld $28, %xmm0
+; X86-SSE2-NEXT:    pslld $27, %xmm1
+; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT:    por %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
   ret <2 x i32> %res

From 1941f341722178390f71e07502e08a2250a704c7 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov
Date: Wed, 18 Dec 2024 14:44:55 +0300
Subject: [PATCH 19/37] [TableGen][GISel] Import more "multi-level" patterns
 (#120332)

Previously, if the destination DAG had an untyped leaf, we would import
the pattern only if that leaf was defined by the *top-level* source DAG.
This is an unnecessary restriction.

Here is an example of such a pattern:
```
def : Pat<(add (mul v8i16:$vA, v8i16:$vB), v8i16:$vC),
          (VMLADDUHM $vA, $vB, $vC)>;
```

Previously, it failed to import because `add` defines neither `$vA` nor
`$vB`.

This change reduces the number of skipped patterns as follows:
```
AArch64: 8695 -> 8548 (-147)
AMDGPU: 11333 -> 11240 (-93)
ARM: 4297 -> 4278 (-1)
PowerPC: 3955 -> 3010 (-945)
```

Other GISel-enabled targets are unaffected.
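For illustration, a minimal IR sketch of the kind of function this pattern
now covers (the function name is hypothetical, and actually selecting
VMLADDUHM assumes an AltiVec-enabled PowerPC target):
```
; %vA and %vB are bound by the nested (mul ...) of the source pattern,
; not by the top-level (add ...) - the case that previously failed to import.
define <8 x i16> @mla_v8i16(<8 x i16> %vA, <8 x i16> %vB, <8 x i16> %vC) {
  %mul = mul <8 x i16> %vA, %vB
  %add = add <8 x i16> %mul, %vC
  ret <8 x i16> %add
}
```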
--- .../GlobalISel/inst-select-ashr.s16.mir | 14 +- .../AMDGPU/GlobalISel/inst-select-ctpop.mir | 25 +- .../GlobalISel/inst-select-lshr.s16.mir | 14 +- .../AMDGPU/GlobalISel/inst-select-shl.s16.mir | 14 +- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 170 +- llvm/test/CodeGen/AMDGPU/constrained-shift.ll | 4 - .../CodeGen/AMDGPU/integer-mad-patterns.ll | 1612 +++++++---------- llvm/utils/TableGen/GlobalISelEmitter.cpp | 11 +- 8 files changed, 821 insertions(+), 1043 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir index fb7c2d4d705e75..95d2bae98df2e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -274,24 +274,18 @@ body: | ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX8-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ASHRREV_I16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ASHRREV_I16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: ashr_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir index 779312596313a3..3a2ed71e4d2242 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ctpop.mir @@ -79,9 +79,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], 
[[COPY1]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CTPOP %0 @@ -104,9 +103,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY1]], [[V_BCNT_U32_B32_e64_]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = G_CTPOP %0 @@ -155,9 +153,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY1]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CTPOP %0 @@ -181,9 +178,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], [[COPY]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:sgpr(s32) = COPY $sgpr0 %2:vgpr(s32) = G_CTPOP %1 @@ -207,9 +203,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[S_BCNT1_I32_B32_:%[0-9]+]]:sreg_32 = S_BCNT1_I32_B32 [[COPY]], implicit-def dead $scc - ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[S_BCNT1_I32_B32_]], [[COPY1]], 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]] + ; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:sgpr(s32) = G_CTPOP %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir index e7ec5fcbba2473..a96b574a647848 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -272,24 +272,18 @@ body: | ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX8-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHRREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHRREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: lshr_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir index bcb6d75c18302b..b0703a642e033a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -272,24 +272,18 @@ body: | ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX9-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} ; 
GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1 ; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]] ; GFX10-LABEL: name: shl_s16_vv_zext_to_s64 ; GFX10: liveins: $vgpr0, $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 6bb4e2d3dbe26e..ed85fb19d90517 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -204,18 +204,37 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b) } define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { -; GCN-LABEL: vector_xnor_i32_one_use: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: vector_xnor_i32_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: vector_xnor_i32_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: vector_xnor_i32_one_use: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: vector_xnor_i32_one_use: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v1 +; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: vector_xnor_i32_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_xnor_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i32 %a, %b @@ -224,22 +243,45 @@ entry: } define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { -; GCN-LABEL: vector_xnor_i64_one_use: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v3 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: v_not_b32_e32 v1, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: vector_xnor_i64_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: v_not_b32_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
vector_xnor_i64_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: vector_xnor_i64_one_use: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX900-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: v_not_b32_e32 v1, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: vector_xnor_i64_one_use: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v2 +; GFX906-NEXT: v_xnor_b32_e32 v1, v1, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: vector_xnor_i64_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_not_b32_e32 v0, v0 -; GFX10-NEXT: v_not_b32_e32 v1, v1 +; GFX10-NEXT: v_xnor_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_xnor_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %xor = xor i64 %a, %b @@ -248,16 +290,32 @@ entry: } define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) { -; GCN-LABEL: xnor_s_v_i32_one_use: -; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX7-LABEL: xnor_s_v_i32_one_use: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: xnor_s_v_i32_one_use: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX900-LABEL: xnor_s_v_i32_one_use: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX906-LABEL: xnor_s_v_i32_one_use: +; GFX906: ; %bb.0: +; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0 +; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_s_v_i32_one_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_xnor_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %s, %v %d = xor i32 %xor, -1 @@ -266,16 +324,32 @@ define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) { } define amdgpu_ps float @xnor_v_s_i32_one_use(i32 inreg %s, i32 %v) { -; GCN-LABEL: xnor_v_s_i32_one_use: -; GCN: ; %bb.0: -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_not_b32_e32 v0, v0 -; GCN-NEXT: ; return to shader part epilog +; GFX7-LABEL: xnor_v_s_i32_one_use: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v0 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: xnor_v_s_i32_one_use: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX900-LABEL: xnor_v_s_i32_one_use: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX900-NEXT: v_not_b32_e32 v0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX906-LABEL: xnor_v_s_i32_one_use: +; GFX906: ; %bb.0: +; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0 +; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: 
xnor_v_s_i32_one_use: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_xnor_b32_e64 v0, v0, s0 ; GFX10-NEXT: ; return to shader part epilog %xor = xor i32 %v, %s %d = xor i32 %xor, -1 @@ -314,19 +388,15 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) { ; GFX906-LABEL: xnor_i64_s_v_one_use: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX906-NEXT: v_not_b32_e32 v0, v0 -; GFX906-NEXT: v_not_b32_e32 v1, v1 +; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0 +; GFX906-NEXT: v_xnor_b32_e32 v1, s1, v1 ; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_i64_s_v_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX10-NEXT: v_not_b32_e32 v0, v0 -; GFX10-NEXT: v_not_b32_e32 v1, v1 +; GFX10-NEXT: v_xnor_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_xnor_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog entry: %b = shl i64 %b64, 29 @@ -367,19 +437,15 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) { ; GFX906-LABEL: xnor_i64_v_s_one_use: ; GFX906: ; %bb.0: ; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX906-NEXT: v_not_b32_e32 v0, v0 -; GFX906-NEXT: v_not_b32_e32 v1, v1 +; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0 +; GFX906-NEXT: v_xnor_b32_e64 v1, v1, s1 ; GFX906-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xnor_i64_v_s_one_use: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX10-NEXT: v_not_b32_e32 v0, v0 -; GFX10-NEXT: v_not_b32_e32 v1, v1 +; GFX10-NEXT: v_xnor_b32_e64 v0, v0, s0 +; GFX10-NEXT: v_xnor_b32_e64 v1, v1, s1 ; GFX10-NEXT: ; return to shader part epilog %b = shl i64 %b64, 29 %xor = xor i64 %b, %a @@ -419,7 +485,7 @@ define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) { ; GFX10-LABEL: vector_xor_na_b_i32_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor3_b32 v0, v0, -1, v1 +; GFX10-NEXT: v_xnor_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %na = xor i32 %a, -1 @@ -458,7 +524,7 @@ define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) { ; GFX10-LABEL: vector_xor_a_nb_i32_one_use: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor3_b32 v0, v1, -1, v0 +; GFX10-NEXT: v_xnor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: %nb = xor i32 %b, -1 diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index 1b35a89ad7f935..4011c21af69046 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -139,10 +139,6 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) { ; GISEL-LABEL: csh_v4i32: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_and_b32_e32 v4, 31, v4 -; GISEL-NEXT: v_and_b32_e32 v5, 31, v5 -; GISEL-NEXT: v_and_b32_e32 v6, 31, v6 -; GISEL-NEXT: v_and_b32_e32 v7, 31, v7 ; GISEL-NEXT: v_lshlrev_b32_e32 v8, v4, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v9, v5, v1 ; GISEL-NEXT: v_lshlrev_b32_e32 v10, v6, v2 diff --git 
a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 9f093cc7b5abf2..26a4ea9d8a4b6e 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -230,49 +230,27 @@ entry: } define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { -; GFX67-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_imad_pat_i16: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_imad_pat_i16: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i16: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -337,11 +315,11 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, 
v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -363,13 +341,13 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -400,13 +378,13 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -470,42 +448,40 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; 
GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16: @@ -682,46 +658,43 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v6, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v7, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v3, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v5, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; 
GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v1, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v4, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v3, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v3i16: @@ -1063,19 +1036,15 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v10, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v10, v5, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v9, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v11, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v9, v6, v2 +; GFX67-GISEL-NEXT: 
v_mad_u32_u24 v3, v11, v7, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1085,60 +1054,60 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v10 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v11 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX67-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX67-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v3, v7 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v11, v11, v7, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v6 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: 
v_and_b32_e32 v5, 0xffff, v13 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v5, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v6, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v1, v9 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v2, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v4i16: @@ -1403,47 +1372,26 @@ entry: } define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { -; GFX67-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v3, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_umad_pat_i16: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 
v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_umad_pat_i16: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_mul_u32_u24_e32 v2, v0, v1 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v3, v2 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v1, v3, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_i16: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -1504,11 +1452,11 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1530,13 +1478,13 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1567,13 +1515,13 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, 
v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1637,42 +1585,40 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16: @@ -1849,46 +1795,43 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v6, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v7, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v6, v3, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v5, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v0, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v1, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v7, v7, v4, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v3, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; 
GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v3, v2 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v3i16: @@ -2230,19 +2173,15 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v10, v10, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v10, v5, v1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v8, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v9, v9, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v11, v11, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v9, v6, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v11, v7, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2252,60 +2191,60 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX67-GISEL-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v10 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v13, v2, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v10, v10, v5, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v2, v5, 1 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v12, v0, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v8, v8, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v11 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v2 -; GFX67-GISEL-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; 
GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX67-GISEL-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX67-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v15, v3, v7 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v11, v11, v7, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v3, v7, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v14, v1, v6 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v9, v9, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v6, 1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v12 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v13 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v5, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v14 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v15 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v6, v3 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v4, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v8 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v1, v9 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v6 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v2, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v7 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v6 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v6 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v4, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v7 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v4i16: @@ -4282,49 +4221,27 @@ entry: } define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext 
%y) { -; GFX67-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_imad_pat_i8: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_imad_pat_i8: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GFX67-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i8: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -4389,11 +4306,11 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; 
GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -4415,13 +4332,13 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 @@ -4452,13 +4369,13 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 @@ -4524,32 +4441,30 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v5, v3, 1 +; GFX67-GISEL-NEXT: 
v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v2, v1 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i8: @@ -4655,20 +4570,18 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v4, v0, v2 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v5, v1, v3 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v4, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v5, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v4, 1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v3, v5, 1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v4, v0, 1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v5, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX10-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i8: @@ -4704,25 +4617,21 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v4, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v3, v5, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX11-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX11-GISEL-NEXT: 
v_mad_u16 v1, v1, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add_nc_u16 v4, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v5, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-LABEL: clpeak_imad_pat_v2i8: @@ -4766,25 +4675,21 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v4, v0, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v5, v1, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v4, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v5, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v4, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v3, v5, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v4, v0, v2, v0 +; GFX1200-GISEL-NEXT: v_mad_u16 v5, v1, v3, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v2, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v1, v3, 1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v4, v0, 1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v5, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v6, v4, v2 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v7, v5, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v4, v2, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v3, v5, v3, 1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v6, v0 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v7, v1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i8> %x, @@ -7600,81 +7505,43 @@ entry: } define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { -; GFX67-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; 
GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_imad_pat_i16_x2: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_imad_pat_i16_x2: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v3, v0, v2 +; GFX67-NEXT: 
v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v1, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i16_x2: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -7767,19 +7634,19 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -7807,23 +7674,23 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, 
v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -7860,23 +7727,23 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: 
v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -7902,79 +7769,42 @@ entry: } define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { -; GFX67-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v3, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v3, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v2, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: clpeak_umad_pat_i16_x2: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 
v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: clpeak_umad_pat_i16_x2: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_mul_u32_u24_e32 v2, v0, v1 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v3, v2 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v2, v3, v2, 1 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v2, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v2, v2, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v2, v3 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v1, v2, v3, 1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_i16_x2: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -8063,19 +7893,19 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX10-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -8103,23 +7933,23 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; 
GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX11-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX11-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -8156,23 +7986,23 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 ; GFX1200-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v2, v3, 1 
+; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v3, v0, v1 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, 1 +; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v2, v3 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 +; GFX1200-GISEL-NEXT: v_mad_u16 v1, v2, v3, 1 ; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -8268,10 +8098,8 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -8279,9 +8107,9 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v3, 1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -8290,64 +8118,60 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v1, v5, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX67-GISEL-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v5, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v3, v5, 1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v2, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: @@ -8591,10 +8415,8 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> 
%x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v5, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v4, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v5, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -8602,9 +8424,9 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v5, v3, 1 ; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v4, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -8613,64 +8435,60 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v1, v5, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v0, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v5, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v5, v1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 -; GFX67-GISEL-NEXT: 
v_add_i32_e32 v3, vcc, 1, v5 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX67-GISEL-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v3, v5, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v5, v3, v5, 1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v4, v2, v4, 1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-GISEL-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, 1 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX67-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX67-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: @@ -8908,24 +8726,14 @@ entry: } define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { -; GFX67-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v1, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v1, v3 -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: multi_use_mul_mad_i16_var: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: 
v_mul_u32_u24_e32 v1, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v2 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: multi_use_mul_mad_i16_var: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mad_u32_u24 v0, v4, v1, v2 +; GFX67-NEXT: v_mad_u32_u24 v1, v4, v1, v3 +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: multi_use_mul_mad_i16_var: ; GFX8-SDAG: ; %bb.0: ; %entry @@ -8973,10 +8781,9 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX10-GISEL-LABEL: multi_use_mul_mad_i16_var: ; GFX10-GISEL: ; %bb.0: ; %entry ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v0, v2 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX10-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -8992,12 +8799,10 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX11-GISEL-LABEL: multi_use_mul_mad_i16_var: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX11-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9021,12 +8826,10 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v1, v0, v2 -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-GISEL-NEXT: v_mad_u16 v2, v0, v1, v2 +; GFX1200-GISEL-NEXT: v_mad_u16 v0, v0, v1, v3 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX1200-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -9108,29 +8911,17 @@ entry: } define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %ptr) { -; GFX67-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX67-SDAG: ; %bb.0: ; %entry -; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX67-SDAG-NEXT: 
s_mov_b32 m0, -1 -; GFX67-SDAG-NEXT: ds_write_b16 v3, v4 -; GFX67-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX67-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX67-GISEL: ; %bb.0: ; %entry -; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v2 -; GFX67-GISEL-NEXT: s_mov_b32 m0, -1 -; GFX67-GISEL-NEXT: ds_write_b16 v3, v1 -; GFX67-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX67-LABEL: other_use_mul_mad_i16_var: +; GFX67: ; %bb.0: ; %entry +; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mul_u32_u24_e32 v4, v0, v1 +; GFX67-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX67-NEXT: s_mov_b32 m0, -1 +; GFX67-NEXT: ds_write_b16 v3, v4 +; GFX67-NEXT: s_waitcnt lgkmcnt(0) +; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: other_use_mul_mad_i16_var: ; GFX8: ; %bb.0: ; %entry @@ -9151,69 +8942,36 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) % ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX10-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX10-SDAG-NEXT: ds_write_b16 v3, v4 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX10-GISEL-NEXT: v_add_nc_u16 v0, v1, v2 -; GFX10-GISEL-NEXT: ds_write_b16 v3, v1 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX11-SDAG-NEXT: ds_store_b16 v3, v4 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v1, v2 -; GFX11-GISEL-NEXT: ds_store_b16 v3, v1 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: other_use_mul_mad_i16_var: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX10-NEXT: ds_write_b16 v3, v4 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX1200-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX1200-SDAG: ; %bb.0: ; %entry -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX1200-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX1200-SDAG-NEXT: ds_store_b16 
v3, v4 -; GFX1200-SDAG-NEXT: s_wait_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: other_use_mul_mad_i16_var: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX11-NEXT: ds_store_b16 v3, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX1200-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX1200-GISEL: ; %bb.0: ; %entry -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-NEXT: v_add_nc_u16 v0, v1, v2 -; GFX1200-GISEL-NEXT: ds_store_b16 v3, v1 -; GFX1200-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1200-LABEL: other_use_mul_mad_i16_var: +; GFX1200: ; %bb.0: ; %entry +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: v_mul_lo_u16 v4, v0, v1 +; GFX1200-NEXT: v_mad_u16 v0, v0, v1, v2 +; GFX1200-NEXT: ds_store_b16 v3, v4 +; GFX1200-NEXT: s_wait_dscnt 0x0 +; GFX1200-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z @@ -9246,16 +9004,14 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-GISEL-LABEL: multi_use_mul_mad_v2i16_var: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v2, v0, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v3, v0, v1 -; GFX67-GISEL-NEXT: v_add_i32_e32 v0, vcc, v2, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v5 -; GFX67-GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX67-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GFX67-GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v8, v2, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v9, v3, v5 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v2, v8, v2, v6 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v3, v9, v3, v7 ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: multi_use_mul_mad_v2i16_var: @@ -9366,20 +9122,20 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX67-GISEL-LABEL: other_use_mul_mad_v2i16_var: ; GFX67-GISEL: ; %bb.0: ; %entry ; GFX67-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX67-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; GFX67-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX67-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX67-GISEL-NEXT: v_add_i32_e32 
v0, vcc, v0, v4 -; GFX67-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v8, v1, v3 +; GFX67-GISEL-NEXT: v_mul_u32_u24_e32 v7, v0, v2 +; GFX67-GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-GISEL-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX67-GISEL-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX67-GISEL-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v0, v0, v2, v4 +; GFX67-GISEL-NEXT: v_mad_u32_u24 v1, v1, v3, v5 ; GFX67-GISEL-NEXT: s_mov_b32 m0, -1 -; GFX67-GISEL-NEXT: ds_write_b32 v6, v2 +; GFX67-GISEL-NEXT: ds_write_b32 v6, v7 ; GFX67-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX67-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -9532,29 +9288,15 @@ define i64 @mul_u24_add64(i32 %x, i32 %y, i64 %z) { ; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX1200-SDAG-LABEL: mul_u24_add64: -; GFX1200-SDAG: ; %bb.0: -; GFX1200-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] -; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1200-GISEL-LABEL: mul_u24_add64: -; GFX1200-GISEL: ; %bb.0: -; GFX1200-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX1200-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1200-GISEL-NEXT: v_mul_u32_u24_e32 v4, v0, v1 -; GFX1200-GISEL-NEXT: v_mul_hi_u32_u24_e32 v1, v0, v1 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 -; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1200-LABEL: mul_u24_add64: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] +; GFX1200-NEXT: s_setpc_b64 s[30:31] %mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %x, i32 %y) %add = add i64 %mul, %z ret i64 %add diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 83599e789e10b9..84f23985b64213 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -1350,13 +1350,10 @@ Expected GlobalISelEmitter::importExplicitUseRenderer( // Handle the case where the MVT/register class is omitted in the dest pattern // but MVT exists in the source pattern. - if (isa(DstChild.getLeafValue())) { - for (const TreePatternNode &SrcChild : Src.children()) { - if (SrcChild.getName() == DstChild.getName()) { - DstMIBuilder.addRenderer(SrcChild.getName()); - return InsertPt; - } - } + if (isa(DstChild.getLeafValue()) && + Rule.hasOperand(DstChild.getName())) { + DstMIBuilder.addRenderer(DstChild.getName()); + return InsertPt; } return failedImport("Dst pattern child is an unsupported kind"); } From 3146911eb0eee821535444aa207a4ec5020c9c6a Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 18 Dec 2024 11:51:01 +0000 Subject: [PATCH 20/37] [LLVM][AsmPrinter] Add vector ConstantInt/FP support to emitGlobalConstantImpl. 
(#120077)

This fixes a failure path for fixed-length vector globals when
ConstantInt/FP is used to represent splats instead of
ConstantDataVector.
---
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    | 32 +++++++++++--------
 llvm/lib/IR/Constants.cpp                     |  7 ++++
 ...treaming-mode-fixed-length-splat-vector.ll |  3 +-
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 47a93d624dfa9c..d2e60bb7f6318c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3643,10 +3643,11 @@ static void emitGlobalConstantArray(const DataLayout &DL,
 
 static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP);
 
-static void emitGlobalConstantVector(const DataLayout &DL,
-                                     const ConstantVector *CV, AsmPrinter &AP,
+static void emitGlobalConstantVector(const DataLayout &DL, const Constant *CV,
+                                     AsmPrinter &AP,
                                      AsmPrinter::AliasMapTy *AliasList) {
-  Type *ElementType = CV->getType()->getElementType();
+  auto *VTy = cast<FixedVectorType>(CV->getType());
+  Type *ElementType = VTy->getElementType();
   uint64_t ElementSizeInBits = DL.getTypeSizeInBits(ElementType);
   uint64_t ElementAllocSizeInBits = DL.getTypeAllocSizeInBits(ElementType);
   uint64_t EmittedSize;
@@ -3659,7 +3660,7 @@ static void emitGlobalConstantVector(const DataLayout &DL,
     Type *IntT = IntegerType::get(CV->getContext(),
                                   DL.getTypeSizeInBits(CV->getType()));
     ConstantInt *CI = dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
-        ConstantExpr::getBitCast(const_cast<ConstantVector *>(CV), IntT), DL));
+        ConstantExpr::getBitCast(const_cast<Constant *>(CV), IntT), DL));
     if (!CI) {
       report_fatal_error(
           "Cannot lower vector global with unusual element type");
@@ -3668,12 +3669,11 @@ static void emitGlobalConstantVector(const DataLayout &DL,
     emitGlobalConstantLargeInt(CI, AP);
     EmittedSize = DL.getTypeStoreSize(CV->getType());
   } else {
-    for (unsigned I = 0, E = CV->getType()->getNumElements(); I != E; ++I) {
+    for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
       emitGlobalAliasInline(AP, DL.getTypeAllocSize(CV->getType()) * I,
                             AliasList);
-      emitGlobalConstantImpl(DL, CV->getOperand(I), AP);
+      emitGlobalConstantImpl(DL, CV->getAggregateElement(I), AP);
     }
-    EmittedSize =
-        DL.getTypeAllocSize(ElementType) * CV->getType()->getNumElements();
+    EmittedSize = DL.getTypeAllocSize(ElementType) * VTy->getNumElements();
   }
 
   unsigned Size = DL.getTypeAllocSize(CV->getType());
@@ -3943,8 +3943,10 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
     return AP.OutStreamer->emitZeros(Size);
 
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
-    const uint64_t StoreSize = DL.getTypeStoreSize(CV->getType());
+    if (isa<VectorType>(CV->getType()))
+      return emitGlobalConstantVector(DL, CV, AP, AliasList);
 
+    const uint64_t StoreSize = DL.getTypeStoreSize(CV->getType());
     if (StoreSize <= 8) {
       if (AP.isVerbose())
         AP.OutStreamer->getCommentOS()
@@ -3961,8 +3963,12 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
       return;
   }
 
-  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
-    return emitGlobalConstantFP(CFP, AP);
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+    if (isa<VectorType>(CV->getType()))
+      return emitGlobalConstantVector(DL, CV, AP, AliasList);
+    else
+      return emitGlobalConstantFP(CFP, AP);
+  }
 
   if (isa<ConstantPointerNull>(CV)) {
     AP.OutStreamer->emitIntValue(0, Size);
@@ -3994,8 +4000,8 @@
     }
   }
 
-  if (const ConstantVector *V = dyn_cast<ConstantVector>(CV))
-    return emitGlobalConstantVector(DL, V, AP, AliasList);
+  if (isa<ConstantVector>(CV))
+    return emitGlobalConstantVector(DL, CV, AP, AliasList);
 
   // Otherwise, it must be a ConstantExpr.  Lower it to an MCExpr, then emit it
   // thread the streamer with EmitValue.
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 949c23609c9d05..db5effbd9a43e7 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -451,6 +451,13 @@ Constant *Constant::getAggregateElement(unsigned Elt) const {
                ? ConstantInt::get(getContext(), CI->getValue())
                : nullptr;
 
+  if (const auto *CFP = dyn_cast<ConstantFP>(this))
+    return Elt < cast<VectorType>(getType())
+                     ->getElementCount()
+                     .getKnownMinValue()
+               ? ConstantFP::get(getContext(), CFP->getValue())
+               : nullptr;
+
   // FIXME: getNumElements() will fail for non-fixed vector types.
   if (isa<ScalableVectorType>(getType()))
     return nullptr;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index a4cf5d608fed6d..96be762b4c8f67 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -1,8 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
 ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
-
-
+; RUN: llc -force-streaming-compatible -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s --check-prefix=NONEON-NOSVE
 
 target triple = "aarch64-unknown-linux-gnu"
 

From bc3eee11ea6f771bf007c4921a34c1dfee040471 Mon Sep 17 00:00:00 2001
From: AnastasiyaChernikova
Date: Wed, 18 Dec 2024 14:53:30 +0300
Subject: [PATCH 21/37] [Exegesis][RISCV] Add RISCV support for llvm-exegesis
 (#89047)

This patch also makes the following amendments to core exegesis:

* Added a distinction between the regular register-aliasing check and
  registers used as memory addresses in an instruction.
* Added a scratch memory space pointer register.
* General exegesis options were amended:
  * mattr - a new option to pass a list of enabled target features.

The llvm-exegesis RISCV port is the result of a team effort; everyone
involved is listed below, after a short usage sketch.
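
As a usage sketch (the opcode, CPU, and feature string here are purely
illustrative; the tests added in this patch exercise the same flags):

    llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu \
      --mcpu=generic -mattr=+c --benchmark-phase=assemble-measured-code \
      -opcode-name=C_ADDI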
Co-authored-by: Konstantin Vladimirov Co-authored-by: Dmitrii Petrov Co-authored-by: Dmitry Bushev Co-authored-by: Mark Goncharov Co-authored-by: Anastasiya Chernikova --------- Co-authored-by: Dmitry Bushev --- .../RISCV/latency-by-extension-A.s | 59 ++++ .../RISCV/latency-by-extension-C.s | 48 +++ .../RISCV/latency-by-opcode-name-FADD_D.s | 11 + llvm/tools/llvm-exegesis/lib/CMakeLists.txt | 3 + .../llvm-exegesis/lib/MCInstrDescView.cpp | 18 +- .../tools/llvm-exegesis/lib/MCInstrDescView.h | 11 +- .../llvm-exegesis/lib/RISCV/CMakeLists.txt | 22 ++ llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp | 275 ++++++++++++++++++ .../lib/SerialSnippetGenerator.cpp | 16 +- llvm/tools/llvm-exegesis/lib/SnippetFile.cpp | 10 +- .../llvm-exegesis/lib/SnippetGenerator.cpp | 12 +- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 33 ++- 12 files changed, 486 insertions(+), 32 deletions(-) create mode 100644 llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s create mode 100644 llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s create mode 100644 llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s create mode 100644 llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt create mode 100644 llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp diff --git a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s new file mode 100644 index 00000000000000..bdc02d4af21551 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s @@ -0,0 +1,59 @@ +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOAND_D -mattr="+a" | FileCheck --check-prefix=AMOAND_D %s + +AMOAND_D: --- +AMOAND_D-NEXT: mode: latency +AMOAND_D-NEXT: key: +AMOAND_D-NEXT: instructions: +AMOAND_D-NEXT: - 'AMOAND_D [[RE01:X[0-9]+]] X10 [[RE01:X[0-9]+]]' +AMOAND_D-NEXT: config: '' +AMOAND_D-NEXT: register_initial_values: +AMOAND_D-NEXT: - '[[RE01:X[0-9]+]]=0x0' +AMOAND_D-DAG: ... + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOADD_W -mattr="+a" | FileCheck --check-prefix=AMOADD_W %s + +AMOADD_W: --- +AMOADD_W-NEXT: mode: latency +AMOADD_W-NEXT: key: +AMOADD_W-NEXT: instructions: +AMOADD_W-NEXT: - 'AMOADD_W [[RE02:X[0-9]+]] X10 [[RE02:X[0-9]+]]' +AMOADD_W-NEXT: config: '' +AMOADD_W-NEXT: register_initial_values: +AMOADD_W-NEXT: - '[[RE02:X[0-9]+]]=0x0' +AMOADD_W-DAG: ... + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOMAXU_D -mattr="+a" | FileCheck --check-prefix=AMOMAXU_D %s + +AMOMAXU_D: --- +AMOMAXU_D-NEXT: mode: latency +AMOMAXU_D-NEXT: key: +AMOMAXU_D-NEXT: instructions: +AMOMAXU_D-NEXT: - 'AMOMAXU_D [[RE03:X[0-9]+]] X10 [[RE03:X[0-9]+]]' +AMOMAXU_D-NEXT: config: '' +AMOMAXU_D-NEXT: register_initial_values: +AMOMAXU_D-NEXT: - '[[RE03:X[0-9]+]]=0x0' +AMOMAXU_D-DAG: ... + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOMIN_W -mattr="+a" | FileCheck --check-prefix=AMOMIN_W %s + +AMOMIN_W: --- +AMOMIN_W-NEXT: mode: latency +AMOMIN_W-NEXT: key: +AMOMIN_W-NEXT: instructions: +AMOMIN_W-NEXT: - 'AMOMIN_W [[RE04:X[0-9]+]] X10 [[RE04:X[0-9]+]]' +AMOMIN_W-NEXT: config: '' +AMOMIN_W-NEXT: register_initial_values: +AMOMIN_W-NEXT: - '[[RE04:X[0-9]+]]=0x0' +AMOMIN_W-DAG: ... 
+ +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOXOR_D -mattr="+a" | FileCheck --check-prefix=AMOXOR_D %s + +AMOXOR_D: --- +AMOXOR_D-NEXT: mode: latency +AMOXOR_D-NEXT: key: +AMOXOR_D-NEXT: instructions: +AMOXOR_D-NEXT: - 'AMOXOR_D [[RE05:X[0-9]+]] X10 [[RE05:X[0-9]+]]' +AMOXOR_D-NEXT: config: '' +AMOXOR_D-NEXT: register_initial_values: +AMOXOR_D-NEXT: - '[[RE05:X[0-9]+]]=0x0' +AMOXOR_D-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s new file mode 100644 index 00000000000000..9e94f024ed1162 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s @@ -0,0 +1,48 @@ +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_ADDI -mattr=+c | FileCheck --check-prefix=C_ADDI %s + +C_ADDI: --- +C_ADDI-NEXT: mode: latency +C_ADDI-NEXT: key: +C_ADDI-NEXT: instructions: +C_ADDI-NEXT: - 'C_ADDI [[REG01:X[0-9]+]] [[RE02:X[0-9]+]] [[IMM0:i_0x[0-9]+]]' + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_ADDIW -mattr=+c | FileCheck --check-prefix=C_ADDIW %s + +C_ADDIW: --- +C_ADDIW-NEXT: mode: latency +C_ADDIW-NEXT: key: +C_ADDIW-NEXT: instructions: +C_ADDIW-NEXT: - 'C_ADDIW [[REG11:X[0-9]+]] [[RE12:X[0-9]+]] [[IMM1:i_0x[0-9]+]]' + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_ANDI -mattr=+c | FileCheck --check-prefix=C_ANDI %s + +C_ANDI: --- +C_ANDI-NEXT: mode: latency +C_ANDI-NEXT: key: +C_ANDI-NEXT: instructions: +C_ANDI-NEXT: - 'C_ANDI [[REG31:X[0-9]+]] [[REG32:X[0-9]+]] [[IMM3:i_0x[0-9]+]]' + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_SLLI -mattr=+c | FileCheck --check-prefix=C_SLLI %s + +C_SLLI: --- +C_SLLI-NEXT: mode: latency +C_SLLI-NEXT: key: +C_SLLI-NEXT: instructions: +C_SLLI-NEXT: - 'C_SLLI [[REG81:X[0-9]+]] [[REG82:X[0-9]+]] [[IMM8:i_0x[0-9]+]]' + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_SRAI -mattr=+c | FileCheck --check-prefix=C_SRAI %s + +C_SRAI: --- +C_SRAI-NEXT: mode: latency +C_SRAI-NEXT: key: +C_SRAI-NEXT: instructions: +C_SRAI-NEXT: - 'C_SRAI [[REG91:X[0-9]+]] [[REG92:X[0-9]+]] [[IMM9:i_0x[0-9]+]]' + +# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_SRLI -mattr=+c | FileCheck --check-prefix=C_SRLI %s + +C_SRLI: --- +C_SRLI-NEXT: mode: latency +C_SRLI-NEXT: key: +C_SRLI-NEXT: instructions: +C_SRLI-NEXT: - 'C_SRLI [[REG101:X[0-9]+]] [[REG102:X[0-9]+]] [[IMM10:i_0x[0-9]+]]' +C_SRLI-DAG: ... 
diff --git a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s new file mode 100644 index 00000000000000..2dea89cca4d7e9 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s @@ -0,0 +1,11 @@ +# RUN: llvm-exegesis -mtriple=riscv64-unknown-linux-gnu --mcpu=generic -mode=latency --benchmark-phase=assemble-measured-code -mattr=+d -opcode-name=FADD_D | FileCheck %s + +CHECK: --- +CHECK-NEXT: mode: latency +CHECK-NEXT: key: +CHECK-NEXT: instructions: +CHECK-NEXT: - 'FADD_D [[REG1:F[0-9]+_D]] [[REG2:F[0-9]+_D]] [[REG3:F[0-9]+_D]] i_0x7' +CHECK-NEXT: config: '' +CHECK-NEXT: register_initial_values: +CHECK-DAG: - '[[REG1]]=0x0' +CHECK-DAG: ... diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt index 414b49e5e021c2..d95c37ff5426bd 100644 --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -12,6 +12,9 @@ endif() if (LLVM_TARGETS_TO_BUILD MATCHES "Mips") list(APPEND LLVM_EXEGESIS_TARGETS "Mips") endif() +if(LLVM_TARGETS_TO_BUILD MATCHES "RISCV") + list(APPEND LLVM_EXEGESIS_TARGETS "RISCV") +endif() set(LLVM_EXEGESIS_TARGETS ${LLVM_EXEGESIS_TARGETS} PARENT_SCOPE) diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp index 9c926d1fc61124..c9225e51213e59 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp @@ -95,11 +95,12 @@ Instruction::Instruction(const MCInstrDesc *Description, StringRef Name, const BitVector *ImplDefRegs, const BitVector *ImplUseRegs, const BitVector *AllDefRegs, - const BitVector *AllUseRegs) + const BitVector *AllUseRegs, + const BitVector *NonMemoryRegs) : Description(*Description), Name(Name), Operands(std::move(Operands)), Variables(std::move(Variables)), ImplDefRegs(*ImplDefRegs), ImplUseRegs(*ImplUseRegs), AllDefRegs(*AllDefRegs), - AllUseRegs(*AllUseRegs) {} + AllUseRegs(*AllUseRegs), NonMemoryRegs(*NonMemoryRegs) {} std::unique_ptr Instruction::create(const MCInstrInfo &InstrInfo, @@ -166,6 +167,8 @@ Instruction::create(const MCInstrInfo &InstrInfo, BitVector ImplUseRegs = RATC.emptyRegisters(); BitVector AllDefRegs = RATC.emptyRegisters(); BitVector AllUseRegs = RATC.emptyRegisters(); + BitVector NonMemoryRegs = RATC.emptyRegisters(); + for (const auto &Op : Operands) { if (Op.isReg()) { const auto &AliasingBits = Op.getRegisterAliasing().aliasedBits(); @@ -177,6 +180,8 @@ Instruction::create(const MCInstrInfo &InstrInfo, ImplDefRegs |= AliasingBits; if (Op.isUse() && Op.isImplicit()) ImplUseRegs |= AliasingBits; + if (Op.isUse() && !Op.isMemory()) + NonMemoryRegs |= AliasingBits; } } // Can't use make_unique because constructor is private. 
@@ -185,7 +190,8 @@ Instruction::create(const MCInstrInfo &InstrInfo, std::move(Variables), BVC.getUnique(std::move(ImplDefRegs)), BVC.getUnique(std::move(ImplUseRegs)), BVC.getUnique(std::move(AllDefRegs)), - BVC.getUnique(std::move(AllUseRegs)))); + BVC.getUnique(std::move(AllUseRegs)), + BVC.getUnique(std::move(NonMemoryRegs)))); } const Operand &Instruction::getPrimaryOperand(const Variable &Var) const { @@ -240,6 +246,12 @@ bool Instruction::hasAliasingRegisters( ForbiddenRegisters); } +bool Instruction::hasAliasingNotMemoryRegisters( + const BitVector &ForbiddenRegisters) const { + return anyCommonExcludingForbidden(AllDefRegs, NonMemoryRegs, + ForbiddenRegisters); +} + bool Instruction::hasOneUseOrOneDef() const { return AllDefRegs.count() || AllUseRegs.count(); } diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h index f8ebc07d01f35e..d7712e21c32c1c 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h @@ -133,6 +133,12 @@ struct Instruction { // aliasing Use and Def registers. bool hasAliasingRegisters(const BitVector &ForbiddenRegisters) const; + // Whether this instruction is self aliasing through some registers. + // Repeating this instruction may execute sequentially by picking aliasing + // Def and Not Memory Use registers. It may also execute in parallel by + // picking non aliasing Def and Not Memory Use registers. + bool hasAliasingNotMemoryRegisters(const BitVector &ForbiddenRegisters) const; + // Whether this instruction's registers alias with OtherInstr's registers. bool hasAliasingRegistersThrough(const Instruction &OtherInstr, const BitVector &ForbiddenRegisters) const; @@ -160,12 +166,15 @@ struct Instruction { const BitVector &ImplUseRegs; // The set of aliased implicit use registers. const BitVector &AllDefRegs; // The set of all aliased def registers. const BitVector &AllUseRegs; // The set of all aliased use registers. + // The set of all aliased not memory use registers. + const BitVector &NonMemoryRegs; + private: Instruction(const MCInstrDesc *Description, StringRef Name, SmallVector Operands, SmallVector Variables, const BitVector *ImplDefRegs, const BitVector *ImplUseRegs, const BitVector *AllDefRegs, - const BitVector *AllUseRegs); + const BitVector *AllUseRegs, const BitVector *NonMemoryRegs); }; // Instructions are expensive to instantiate. This class provides a cache of diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt new file mode 100644 index 00000000000000..489ac6d6e34b33 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt @@ -0,0 +1,22 @@ +include_directories( + ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV + ${LLVM_BINARY_DIR}/lib/Target/RISCV +) + +set(LLVM_LINK_COMPONENTS + CodeGen + RISCV + Exegesis + Core + Support + ) + +add_llvm_library(LLVMExegesisRISCV + DISABLE_LLVM_LINK_LLVM_DYLIB + STATIC + Target.cpp + + DEPENDS + intrinsics_gen + RISCVCommonTableGen + ) diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp new file mode 100644 index 00000000000000..891818b625fe14 --- /dev/null +++ b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp @@ -0,0 +1,275 @@ +//===-- Target.cpp ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../Target.h" + +#include "MCTargetDesc/RISCVBaseInfo.h" +#include "MCTargetDesc/RISCVMCTargetDesc.h" +#include "MCTargetDesc/RISCVMatInt.h" +#include "RISCVInstrInfo.h" + +// include computeAvailableFeatures and computeRequiredFeatures. +#define GET_AVAILABLE_OPCODE_CHECKER +#include "RISCVGenInstrInfo.inc" + +#include "llvm/CodeGen/MachineInstrBuilder.h" + +#include + +namespace llvm { +namespace exegesis { + +namespace { + +// Stores constant value to a general-purpose (integer) register. +static std::vector loadIntReg(const MCSubtargetInfo &STI, unsigned Reg, + const APInt &Value) { + SmallVector MCInstSeq; + std::vector MatIntInstrs; + MCRegister DestReg = Reg; + + RISCVMatInt::generateMCInstSeq(Value.getSExtValue(), STI, DestReg, MCInstSeq); + MatIntInstrs.resize(MCInstSeq.size()); + std::copy(MCInstSeq.begin(), MCInstSeq.end(), MatIntInstrs.begin()); + + return MatIntInstrs; +} + +const unsigned ScratchIntReg = RISCV::X30; // t5 + +// Stores constant bits to a floating-point register. +static std::vector loadFPRegBits(const MCSubtargetInfo &STI, + unsigned Reg, const APInt &Bits, + unsigned FmvOpcode) { + std::vector Instrs = loadIntReg(STI, ScratchIntReg, Bits); + Instrs.push_back(MCInstBuilder(FmvOpcode).addReg(Reg).addReg(ScratchIntReg)); + return Instrs; +} + +// main idea is: +// we support APInt only if (represented as double) it has zero fractional +// part: 1.0, 2.0, 3.0, etc... then we can do the trick: write int to tmp reg t5 +// and then do FCVT this is only reliable thing in 32-bit mode, otherwise we +// need to use __floatsidf +static std::vector loadFP64RegBits32(const MCSubtargetInfo &STI, + unsigned Reg, const APInt &Bits) { + double D = Bits.bitsToDouble(); + double IPart; + double FPart = std::modf(D, &IPart); + + if (std::abs(FPart) > std::numeric_limits::epsilon()) { + errs() << "loadFP64RegBits32 is not implemented for doubles like " << D + << ", please remove fractional part\n"; + return {}; + } + + std::vector Instrs = loadIntReg(STI, ScratchIntReg, Bits); + Instrs.push_back( + MCInstBuilder(RISCV::FCVT_D_W).addReg(Reg).addReg(ScratchIntReg)); + return Instrs; +} + +static MCInst nop() { + // ADDI X0, X0, 0 + return MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X0) + .addReg(RISCV::X0) + .addImm(0); +} + +static bool isVectorRegList(unsigned Reg) { + return RISCV::VRM2RegClass.contains(Reg) || + RISCV::VRM4RegClass.contains(Reg) || + RISCV::VRM8RegClass.contains(Reg) || + RISCV::VRN2M1RegClass.contains(Reg) || + RISCV::VRN2M2RegClass.contains(Reg) || + RISCV::VRN2M4RegClass.contains(Reg) || + RISCV::VRN3M1RegClass.contains(Reg) || + RISCV::VRN3M2RegClass.contains(Reg) || + RISCV::VRN4M1RegClass.contains(Reg) || + RISCV::VRN4M2RegClass.contains(Reg) || + RISCV::VRN5M1RegClass.contains(Reg) || + RISCV::VRN6M1RegClass.contains(Reg) || + RISCV::VRN7M1RegClass.contains(Reg) || + RISCV::VRN8M1RegClass.contains(Reg); +} + +class ExegesisRISCVTarget : public ExegesisTarget { +public: + ExegesisRISCVTarget(); + + bool matchesArch(Triple::ArchType Arch) const override; + + std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, + const APInt &Value) const override; + + unsigned getDefaultLoopCounterRegister(const Triple &) const override; + + void decrementLoopCounterAndJump(MachineBasicBlock &MBB, + MachineBasicBlock &TargetMBB, + const MCInstrInfo &MII, + unsigned LoopRegister) const override; + + unsigned 
getScratchMemoryRegister(const Triple &TT) const override; + + void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, + unsigned Offset) const override; + + ArrayRef getUnavailableRegisters() const override; + + Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var, + MCOperand &AssignedValue, + const BitVector &ForbiddenRegs) const override; + + std::vector + generateInstructionVariants(const Instruction &Instr, + unsigned MaxConfigsPerOpcode) const override; +}; + +ExegesisRISCVTarget::ExegesisRISCVTarget() + : ExegesisTarget(ArrayRef{}, + RISCV_MC::isOpcodeAvailable) {} + +#define GET_REGISTER_MATCHER +#include "RISCVGenAsmMatcher.inc" + +bool ExegesisRISCVTarget::matchesArch(Triple::ArchType Arch) const { + return Arch == Triple::riscv32 || Arch == Triple::riscv64; +} + +std::vector ExegesisRISCVTarget::setRegTo(const MCSubtargetInfo &STI, + unsigned Reg, + const APInt &Value) const { + if (RISCV::GPRRegClass.contains(Reg)) + return loadIntReg(STI, Reg, Value); + if (RISCV::FPR16RegClass.contains(Reg)) + return loadFPRegBits(STI, Reg, Value, RISCV::FMV_H_X); + if (RISCV::FPR32RegClass.contains(Reg)) + return loadFPRegBits(STI, Reg, Value, RISCV::FMV_W_X); + if (RISCV::FPR64RegClass.contains(Reg)) { + if (STI.hasFeature(RISCV::Feature64Bit)) + return loadFPRegBits(STI, Reg, Value, RISCV::FMV_D_X); + return loadFP64RegBits32(STI, Reg, Value); + } + if (Reg == RISCV::FRM || Reg == RISCV::VL || Reg == RISCV::VLENB || + Reg == RISCV::VTYPE || RISCV::GPRPairRegClass.contains(Reg) || + RISCV::VRRegClass.contains(Reg) || isVectorRegList(Reg)) { + // Don't initialize: + // - FRM + // - VL, VLENB, VTYPE + // - vector registers (and vector register lists) + // - Zfinx registers + // Generate 'NOP' so that exegesis treats such registers as initialized + // (it tries to initialize them with '0' anyway). + return {nop()}; + } + errs() << "setRegTo is not implemented for Reg " << Reg + << ", results will be unreliable\n"; + return {}; +} + +const unsigned DefaultLoopCounterReg = RISCV::X31; // t6 +const unsigned ScratchMemoryReg = RISCV::X10; // a0 + +unsigned +ExegesisRISCVTarget::getDefaultLoopCounterRegister(const Triple &) const { + return DefaultLoopCounterReg; +} + +void ExegesisRISCVTarget::decrementLoopCounterAndJump( + MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, + const MCInstrInfo &MII, unsigned LoopRegister) const { + BuildMI(&MBB, DebugLoc(), MII.get(RISCV::ADDI)) + .addDef(LoopRegister) + .addUse(LoopRegister) + .addImm(-1); + BuildMI(&MBB, DebugLoc(), MII.get(RISCV::BNE)) + .addUse(LoopRegister) + .addUse(RISCV::X0) + .addMBB(&TargetMBB); +} + +unsigned ExegesisRISCVTarget::getScratchMemoryRegister(const Triple &TT) const { + return ScratchMemoryReg; // a0 +} + +void ExegesisRISCVTarget::fillMemoryOperands(InstructionTemplate &IT, + unsigned Reg, + unsigned Offset) const { + // TODO: for now we ignore Offset because have no way + // to detect it in instruction. 
+ auto &I = IT.getInstr(); + + auto MemOpIt = + find_if(I.Operands, [](const Operand &Op) { return Op.isMemory(); }); + assert(MemOpIt != I.Operands.end() && + "Instruction must have memory operands"); + + const Operand &MemOp = *MemOpIt; + + assert(MemOp.isReg() && "Memory operand expected to be register"); + + IT.getValueFor(MemOp) = MCOperand::createReg(Reg); +} + +const unsigned UnavailableRegisters[4] = {RISCV::X0, DefaultLoopCounterReg, + ScratchIntReg, ScratchMemoryReg}; + +ArrayRef ExegesisRISCVTarget::getUnavailableRegisters() const { + return UnavailableRegisters; +} + +Error ExegesisRISCVTarget::randomizeTargetMCOperand( + const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue, + const BitVector &ForbiddenRegs) const { + uint8_t OperandType = + Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType; + + switch (OperandType) { + case RISCVOp::OPERAND_FRMARG: + AssignedValue = MCOperand::createImm(RISCVFPRndMode::DYN); + break; + case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO: + AssignedValue = MCOperand::createImm(0b1 << 4); + break; + case RISCVOp::OPERAND_SIMM6_NONZERO: + case RISCVOp::OPERAND_UIMMLOG2XLEN_NONZERO: + AssignedValue = MCOperand::createImm(1); + break; + default: + if (OperandType >= RISCVOp::OPERAND_FIRST_RISCV_IMM && + OperandType <= RISCVOp::OPERAND_LAST_RISCV_IMM) + AssignedValue = MCOperand::createImm(0); + } + return Error::success(); +} + +std::vector +ExegesisRISCVTarget::generateInstructionVariants( + const Instruction &Instr, unsigned int MaxConfigsPerOpcode) const { + InstructionTemplate IT{&Instr}; + for (const Operand &Op : Instr.Operands) + if (Op.isMemory()) { + IT.getValueFor(Op) = MCOperand::createReg(ScratchMemoryReg); + } + return {IT}; +} + +} // anonymous namespace + +static ExegesisTarget *getTheRISCVExegesisTarget() { + static ExegesisRISCVTarget Target; + return &Target; +} + +void InitializeRISCVExegesisTarget() { + ExegesisTarget::registerTarget(getTheRISCVExegesisTarget()); +} + +} // namespace exegesis +} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp index 7100b51bbb7298..9573e2242ad3f7 100644 --- a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp @@ -54,12 +54,6 @@ computeAliasingInstructions(const LLVMState &State, const Instruction *Instr, continue; const Instruction &OtherInstr = State.getIC().getInstr(OtherOpcode); const MCInstrDesc &OtherInstrDesc = OtherInstr.Description; - // Ignore instructions that we cannot run. 
- if (OtherInstrDesc.isPseudo() || OtherInstrDesc.usesCustomInsertionHook() || - OtherInstrDesc.isBranch() || OtherInstrDesc.isIndirectBranch() || - OtherInstrDesc.isCall() || OtherInstrDesc.isReturn()) { - continue; - } if (OtherInstr.hasMemoryOperands()) continue; if (!ET.allowAsBackToBack(OtherInstr)) @@ -81,12 +75,10 @@ static ExecutionMode getExecutionModes(const Instruction &Instr, EM |= ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS; if (Instr.hasMemoryOperands()) EM |= ExecutionMode::SERIAL_VIA_MEMORY_INSTR; - else { - if (Instr.hasAliasingRegisters(ForbiddenRegisters)) - EM |= ExecutionMode::SERIAL_VIA_EXPLICIT_REGS; - if (Instr.hasOneUseOrOneDef()) - EM |= ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR; - } + if (Instr.hasAliasingNotMemoryRegisters(ForbiddenRegisters)) + EM |= ExecutionMode::SERIAL_VIA_EXPLICIT_REGS; + if (Instr.hasOneUseOrOneDef()) + EM |= ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR; return EM; } diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp index b37999ab017f59..282bc8ca912492 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp @@ -37,10 +37,10 @@ namespace { // An MCStreamer that reads a BenchmarkCode definition from a file. class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { public: - explicit BenchmarkCodeStreamer(MCContext *Context, const LLVMState &State, + explicit BenchmarkCodeStreamer(const ExegesisTarget &Target, + MCContext *Context, const LLVMState &State, BenchmarkCode *Result) - : MCStreamer(*Context), State(State), Result(Result) {} - + : MCStreamer(*Context), Target(Target), State(State), Result(Result) {} // Implementation of the MCStreamer interface. We only care about // instructions. void emitInstruction(const MCInst &Instruction, @@ -218,6 +218,7 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { return *RegisterNumber; } + const ExegesisTarget &Target; const LLVMState &State; BenchmarkCode *const Result; unsigned InvalidComments = 0; @@ -251,7 +252,8 @@ Expected> readSnippets(const LLVMState &State, TM.getTarget().createMCObjectFileInfo(Context, /*PIC=*/false)); Context.setObjectFileInfo(ObjectFileInfo.get()); Context.initInlineSourceManager(); - BenchmarkCodeStreamer Streamer(&Context, State, &Result); + BenchmarkCodeStreamer Streamer(State.getExegesisTarget(), &Context, State, + &Result); std::string Error; raw_string_ostream ErrorStream(Error); diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp index 7dcff60a8fd11f..48357d443f713e 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -73,6 +73,9 @@ Error SnippetGenerator::generateConfigurations( for (CodeTemplate &CT : Templates) { // TODO: Generate as many BenchmarkCode as needed. { + CT.ScratchSpacePointerInReg = + State.getExegesisTarget().getScratchMemoryRegister( + State.getTargetMachine().getTargetTriple()); BenchmarkCode BC; BC.Info = CT.Info; BC.Key.Instructions.reserve(CT.Instructions.size()); @@ -108,6 +111,12 @@ std::vector SnippetGenerator::computeRegisterInitialValues( // Loop invariant: DefinedRegs[i] is true iif it has been set at least once // before the current instruction. BitVector DefinedRegs = State.getRATC().emptyRegisters(); + // If target always expects a scratch memory register as live input, + // mark it as defined. 
+ const ExegesisTarget &Target = State.getExegesisTarget(); + unsigned ScratchMemoryReg = Target.getScratchMemoryRegister( + State.getTargetMachine().getTargetTriple()); + DefinedRegs.set(ScratchMemoryReg); std::vector RIV; for (const InstructionTemplate &IT : Instructions) { // Returns the register that this Operand sets or uses, or 0 if this is not @@ -200,7 +209,8 @@ static void setRegisterOperandValue(const RegisterOperandAssignment &ROV, if (ROV.Op->isExplicit()) { auto &AssignedValue = IB.getValueFor(*ROV.Op); if (AssignedValue.isValid()) { - assert(AssignedValue.isReg() && AssignedValue.getReg() == ROV.Reg); + // TODO don't re-assign register operands which are already "locked" + // by Target in corresponding InstructionTemplate return; } AssignedValue = MCOperand::createReg(ROV.Reg); diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 546ec770a8d221..fa37e05956be8c 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -274,6 +274,10 @@ static cl::opt BenchmarkProcessCPU( cl::desc("The CPU number that the benchmarking process should executon on"), cl::cat(BenchmarkOptions), cl::init(-1)); +static cl::opt MAttr( + "mattr", cl::desc("comma-separated list of target architecture features"), + cl::value_desc("+feature1,-feature2,..."), cl::cat(Options), cl::init("")); + static ExitOnError ExitOnErr("llvm-exegesis error: "); // Helper function that logs the error(s) and exits. @@ -296,6 +300,18 @@ T ExitOnFileError(const Twine &FileName, Expected &&E) { return std::move(*E); } +static const char *getIgnoredOpcodeReasonOrNull(const LLVMState &State, + unsigned Opcode) { + const MCInstrDesc &InstrDesc = State.getIC().getInstr(Opcode).Description; + if (InstrDesc.isPseudo() || InstrDesc.usesCustomInsertionHook()) + return "Unsupported opcode: isPseudo/usesCustomInserter"; + if (InstrDesc.isBranch() || InstrDesc.isIndirectBranch()) + return "Unsupported opcode: isBranch/isIndirectBranch"; + if (InstrDesc.isCall() || InstrDesc.isReturn()) + return "Unsupported opcode: isCall/isReturn"; + return nullptr; +} + // Checks that only one of OpcodeNames, OpcodeIndex or SnippetsFile is provided, // and returns the opcode indices or {} if snippets should be read from // `SnippetsFile`. @@ -334,6 +350,7 @@ static std::vector getOpcodesOrDie(const LLVMState &State) { return I->getSecond(); return 0u; }; + SmallVector Pieces; StringRef(OpcodeNames.getValue()) .split(Pieces, ",", /* MaxSplit */ -1, /* KeepEmpty */ false); @@ -352,17 +369,11 @@ static std::vector getOpcodesOrDie(const LLVMState &State) { static Expected> generateSnippets(const LLVMState &State, unsigned Opcode, const BitVector &ForbiddenRegs) { - const Instruction &Instr = State.getIC().getInstr(Opcode); - const MCInstrDesc &InstrDesc = Instr.Description; // Ignore instructions that we cannot run. 
- if (InstrDesc.isPseudo() || InstrDesc.usesCustomInsertionHook()) - return make_error( - "Unsupported opcode: isPseudo/usesCustomInserter"); - if (InstrDesc.isBranch() || InstrDesc.isIndirectBranch()) - return make_error("Unsupported opcode: isBranch/isIndirectBranch"); - if (InstrDesc.isCall() || InstrDesc.isReturn()) - return make_error("Unsupported opcode: isCall/isReturn"); + if (const char *Reason = getIgnoredOpcodeReasonOrNull(State, Opcode)) + return make_error(Reason); + const Instruction &Instr = State.getIC().getInstr(Opcode); const std::vector InstructionVariants = State.getExegesisTarget().generateInstructionVariants( Instr, MaxConfigsPerOpcode); @@ -485,8 +496,8 @@ void benchmarkMain() { LLVMInitialize##TargetName##AsmParser(); #include "llvm/Config/TargetExegesis.def" - const LLVMState State = - ExitOnErr(LLVMState::Create(TripleName, MCPU, "", UseDummyPerfCounters)); + const LLVMState State = ExitOnErr( + LLVMState::Create(TripleName, MCPU, MAttr, UseDummyPerfCounters)); // Preliminary check to ensure features needed for requested // benchmark mode are present on target CPU and/or OS. From f270c9a7d0add028bcb80df5a3d73b85b0ebe7f4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 18 Dec 2024 11:34:53 +0000 Subject: [PATCH 22/37] [X86] urem-seteq-illegal-types.ll - regenerate VPTERNLOG comment --- llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index b4e91da920a2fd..28ac4496acb9be 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -245,7 +245,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047] ; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogd $200, %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm2 & (xmm0 | xmm1) ; AVX512VL-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 ; AVX512VL-NEXT: kshiftrw $1, %k0, %k1 ; AVX512VL-NEXT: kmovw %k1, %edx From 2fa4b502d1910b8f134e01274d3898a265b0c88b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 18 Dec 2024 12:23:50 +0000 Subject: [PATCH 23/37] Fix unused variable warning. NFC. --- llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp index 9573e2242ad3f7..25cdf1ce66d449 100644 --- a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp @@ -53,7 +53,6 @@ computeAliasingInstructions(const LLVMState &State, const Instruction *Instr, if (OtherOpcode == Instr->Description.getOpcode()) continue; const Instruction &OtherInstr = State.getIC().getInstr(OtherOpcode); - const MCInstrDesc &OtherInstrDesc = OtherInstr.Description; if (OtherInstr.hasMemoryOperands()) continue; if (!ET.allowAsBackToBack(OtherInstr)) From bf62ea40eee82794abc8ed767c150d6c8d0c0b0a Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Wed, 18 Dec 2024 20:34:51 +0800 Subject: [PATCH 24/37] Revert "[Exegesis][RISCV] Add RISCV support for llvm-exegesis (#89047)" This reverts commit bc3eee11ea6f771bf007c4921a34c1dfee040471. These tests are failing because of no `REQUIRES`. 
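
(For reference, a minimal sketch of the missing guard, assuming the
standard lit feature name for a built RISC-V backend, is a single line
at the top of each test:

    # REQUIRES: riscv-registered-target

so that the tests are skipped when the target is not compiled in.)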
--- .../RISCV/latency-by-extension-A.s | 59 ---- .../RISCV/latency-by-extension-C.s | 48 --- .../RISCV/latency-by-opcode-name-FADD_D.s | 11 - llvm/tools/llvm-exegesis/lib/CMakeLists.txt | 3 - .../llvm-exegesis/lib/MCInstrDescView.cpp | 18 +- .../tools/llvm-exegesis/lib/MCInstrDescView.h | 11 +- .../llvm-exegesis/lib/RISCV/CMakeLists.txt | 22 -- llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp | 275 ------------------ .../lib/SerialSnippetGenerator.cpp | 17 +- llvm/tools/llvm-exegesis/lib/SnippetFile.cpp | 10 +- .../llvm-exegesis/lib/SnippetGenerator.cpp | 12 +- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 33 +-- 12 files changed, 33 insertions(+), 486 deletions(-) delete mode 100644 llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s delete mode 100644 llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s delete mode 100644 llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s delete mode 100644 llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt delete mode 100644 llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp diff --git a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s deleted file mode 100644 index bdc02d4af21551..00000000000000 --- a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-A.s +++ /dev/null @@ -1,59 +0,0 @@ -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOAND_D -mattr="+a" | FileCheck --check-prefix=AMOAND_D %s - -AMOAND_D: --- -AMOAND_D-NEXT: mode: latency -AMOAND_D-NEXT: key: -AMOAND_D-NEXT: instructions: -AMOAND_D-NEXT: - 'AMOAND_D [[RE01:X[0-9]+]] X10 [[RE01:X[0-9]+]]' -AMOAND_D-NEXT: config: '' -AMOAND_D-NEXT: register_initial_values: -AMOAND_D-NEXT: - '[[RE01:X[0-9]+]]=0x0' -AMOAND_D-DAG: ... - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOADD_W -mattr="+a" | FileCheck --check-prefix=AMOADD_W %s - -AMOADD_W: --- -AMOADD_W-NEXT: mode: latency -AMOADD_W-NEXT: key: -AMOADD_W-NEXT: instructions: -AMOADD_W-NEXT: - 'AMOADD_W [[RE02:X[0-9]+]] X10 [[RE02:X[0-9]+]]' -AMOADD_W-NEXT: config: '' -AMOADD_W-NEXT: register_initial_values: -AMOADD_W-NEXT: - '[[RE02:X[0-9]+]]=0x0' -AMOADD_W-DAG: ... - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOMAXU_D -mattr="+a" | FileCheck --check-prefix=AMOMAXU_D %s - -AMOMAXU_D: --- -AMOMAXU_D-NEXT: mode: latency -AMOMAXU_D-NEXT: key: -AMOMAXU_D-NEXT: instructions: -AMOMAXU_D-NEXT: - 'AMOMAXU_D [[RE03:X[0-9]+]] X10 [[RE03:X[0-9]+]]' -AMOMAXU_D-NEXT: config: '' -AMOMAXU_D-NEXT: register_initial_values: -AMOMAXU_D-NEXT: - '[[RE03:X[0-9]+]]=0x0' -AMOMAXU_D-DAG: ... - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOMIN_W -mattr="+a" | FileCheck --check-prefix=AMOMIN_W %s - -AMOMIN_W: --- -AMOMIN_W-NEXT: mode: latency -AMOMIN_W-NEXT: key: -AMOMIN_W-NEXT: instructions: -AMOMIN_W-NEXT: - 'AMOMIN_W [[RE04:X[0-9]+]] X10 [[RE04:X[0-9]+]]' -AMOMIN_W-NEXT: config: '' -AMOMIN_W-NEXT: register_initial_values: -AMOMIN_W-NEXT: - '[[RE04:X[0-9]+]]=0x0' -AMOMIN_W-DAG: ... 
- -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=AMOXOR_D -mattr="+a" | FileCheck --check-prefix=AMOXOR_D %s - -AMOXOR_D: --- -AMOXOR_D-NEXT: mode: latency -AMOXOR_D-NEXT: key: -AMOXOR_D-NEXT: instructions: -AMOXOR_D-NEXT: - 'AMOXOR_D [[RE05:X[0-9]+]] X10 [[RE05:X[0-9]+]]' -AMOXOR_D-NEXT: config: '' -AMOXOR_D-NEXT: register_initial_values: -AMOXOR_D-NEXT: - '[[RE05:X[0-9]+]]=0x0' -AMOXOR_D-DAG: ... diff --git a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s deleted file mode 100644 index 9e94f024ed1162..00000000000000 --- a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-extension-C.s +++ /dev/null @@ -1,48 +0,0 @@ -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_ADDI -mattr=+c | FileCheck --check-prefix=C_ADDI %s - -C_ADDI: --- -C_ADDI-NEXT: mode: latency -C_ADDI-NEXT: key: -C_ADDI-NEXT: instructions: -C_ADDI-NEXT: - 'C_ADDI [[REG01:X[0-9]+]] [[RE02:X[0-9]+]] [[IMM0:i_0x[0-9]+]]' - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_ADDIW -mattr=+c | FileCheck --check-prefix=C_ADDIW %s - -C_ADDIW: --- -C_ADDIW-NEXT: mode: latency -C_ADDIW-NEXT: key: -C_ADDIW-NEXT: instructions: -C_ADDIW-NEXT: - 'C_ADDIW [[REG11:X[0-9]+]] [[RE12:X[0-9]+]] [[IMM1:i_0x[0-9]+]]' - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_ANDI -mattr=+c | FileCheck --check-prefix=C_ANDI %s - -C_ANDI: --- -C_ANDI-NEXT: mode: latency -C_ANDI-NEXT: key: -C_ANDI-NEXT: instructions: -C_ANDI-NEXT: - 'C_ANDI [[REG31:X[0-9]+]] [[REG32:X[0-9]+]] [[IMM3:i_0x[0-9]+]]' - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_SLLI -mattr=+c | FileCheck --check-prefix=C_SLLI %s - -C_SLLI: --- -C_SLLI-NEXT: mode: latency -C_SLLI-NEXT: key: -C_SLLI-NEXT: instructions: -C_SLLI-NEXT: - 'C_SLLI [[REG81:X[0-9]+]] [[REG82:X[0-9]+]] [[IMM8:i_0x[0-9]+]]' - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_SRAI -mattr=+c | FileCheck --check-prefix=C_SRAI %s - -C_SRAI: --- -C_SRAI-NEXT: mode: latency -C_SRAI-NEXT: key: -C_SRAI-NEXT: instructions: -C_SRAI-NEXT: - 'C_SRAI [[REG91:X[0-9]+]] [[REG92:X[0-9]+]] [[IMM9:i_0x[0-9]+]]' - -# RUN: llvm-exegesis -mode=latency -mtriple=riscv64-unknown-linux-gnu --mcpu=generic --benchmark-phase=assemble-measured-code -opcode-name=C_SRLI -mattr=+c | FileCheck --check-prefix=C_SRLI %s - -C_SRLI: --- -C_SRLI-NEXT: mode: latency -C_SRLI-NEXT: key: -C_SRLI-NEXT: instructions: -C_SRLI-NEXT: - 'C_SRLI [[REG101:X[0-9]+]] [[REG102:X[0-9]+]] [[IMM10:i_0x[0-9]+]]' -C_SRLI-DAG: ... 
diff --git a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s b/llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s deleted file mode 100644 index 2dea89cca4d7e9..00000000000000 --- a/llvm/test/tools/llvm-exegesis/RISCV/latency-by-opcode-name-FADD_D.s +++ /dev/null @@ -1,11 +0,0 @@ -# RUN: llvm-exegesis -mtriple=riscv64-unknown-linux-gnu --mcpu=generic -mode=latency --benchmark-phase=assemble-measured-code -mattr=+d -opcode-name=FADD_D | FileCheck %s - -CHECK: --- -CHECK-NEXT: mode: latency -CHECK-NEXT: key: -CHECK-NEXT: instructions: -CHECK-NEXT: - 'FADD_D [[REG1:F[0-9]+_D]] [[REG2:F[0-9]+_D]] [[REG3:F[0-9]+_D]] i_0x7' -CHECK-NEXT: config: '' -CHECK-NEXT: register_initial_values: -CHECK-DAG: - '[[REG1]]=0x0' -CHECK-DAG: ... diff --git a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt index d95c37ff5426bd..414b49e5e021c2 100644 --- a/llvm/tools/llvm-exegesis/lib/CMakeLists.txt +++ b/llvm/tools/llvm-exegesis/lib/CMakeLists.txt @@ -12,9 +12,6 @@ endif() if (LLVM_TARGETS_TO_BUILD MATCHES "Mips") list(APPEND LLVM_EXEGESIS_TARGETS "Mips") endif() -if(LLVM_TARGETS_TO_BUILD MATCHES "RISCV") - list(APPEND LLVM_EXEGESIS_TARGETS "RISCV") -endif() set(LLVM_EXEGESIS_TARGETS ${LLVM_EXEGESIS_TARGETS} PARENT_SCOPE) diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp index c9225e51213e59..9c926d1fc61124 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.cpp @@ -95,12 +95,11 @@ Instruction::Instruction(const MCInstrDesc *Description, StringRef Name, const BitVector *ImplDefRegs, const BitVector *ImplUseRegs, const BitVector *AllDefRegs, - const BitVector *AllUseRegs, - const BitVector *NonMemoryRegs) + const BitVector *AllUseRegs) : Description(*Description), Name(Name), Operands(std::move(Operands)), Variables(std::move(Variables)), ImplDefRegs(*ImplDefRegs), ImplUseRegs(*ImplUseRegs), AllDefRegs(*AllDefRegs), - AllUseRegs(*AllUseRegs), NonMemoryRegs(*NonMemoryRegs) {} + AllUseRegs(*AllUseRegs) {} std::unique_ptr Instruction::create(const MCInstrInfo &InstrInfo, @@ -167,8 +166,6 @@ Instruction::create(const MCInstrInfo &InstrInfo, BitVector ImplUseRegs = RATC.emptyRegisters(); BitVector AllDefRegs = RATC.emptyRegisters(); BitVector AllUseRegs = RATC.emptyRegisters(); - BitVector NonMemoryRegs = RATC.emptyRegisters(); - for (const auto &Op : Operands) { if (Op.isReg()) { const auto &AliasingBits = Op.getRegisterAliasing().aliasedBits(); @@ -180,8 +177,6 @@ Instruction::create(const MCInstrInfo &InstrInfo, ImplDefRegs |= AliasingBits; if (Op.isUse() && Op.isImplicit()) ImplUseRegs |= AliasingBits; - if (Op.isUse() && !Op.isMemory()) - NonMemoryRegs |= AliasingBits; } } // Can't use make_unique because constructor is private. 
@@ -190,8 +185,7 @@ Instruction::create(const MCInstrInfo &InstrInfo, std::move(Variables), BVC.getUnique(std::move(ImplDefRegs)), BVC.getUnique(std::move(ImplUseRegs)), BVC.getUnique(std::move(AllDefRegs)), - BVC.getUnique(std::move(AllUseRegs)), - BVC.getUnique(std::move(NonMemoryRegs)))); + BVC.getUnique(std::move(AllUseRegs)))); } const Operand &Instruction::getPrimaryOperand(const Variable &Var) const { @@ -246,12 +240,6 @@ bool Instruction::hasAliasingRegisters( ForbiddenRegisters); } -bool Instruction::hasAliasingNotMemoryRegisters( - const BitVector &ForbiddenRegisters) const { - return anyCommonExcludingForbidden(AllDefRegs, NonMemoryRegs, - ForbiddenRegisters); -} - bool Instruction::hasOneUseOrOneDef() const { return AllDefRegs.count() || AllUseRegs.count(); } diff --git a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h index d7712e21c32c1c..f8ebc07d01f35e 100644 --- a/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h +++ b/llvm/tools/llvm-exegesis/lib/MCInstrDescView.h @@ -133,12 +133,6 @@ struct Instruction { // aliasing Use and Def registers. bool hasAliasingRegisters(const BitVector &ForbiddenRegisters) const; - // Whether this instruction is self aliasing through some registers. - // Repeating this instruction may execute sequentially by picking aliasing - // Def and Not Memory Use registers. It may also execute in parallel by - // picking non aliasing Def and Not Memory Use registers. - bool hasAliasingNotMemoryRegisters(const BitVector &ForbiddenRegisters) const; - // Whether this instruction's registers alias with OtherInstr's registers. bool hasAliasingRegistersThrough(const Instruction &OtherInstr, const BitVector &ForbiddenRegisters) const; @@ -166,15 +160,12 @@ struct Instruction { const BitVector &ImplUseRegs; // The set of aliased implicit use registers. const BitVector &AllDefRegs; // The set of all aliased def registers. const BitVector &AllUseRegs; // The set of all aliased use registers. - // The set of all aliased not memory use registers. - const BitVector &NonMemoryRegs; - private: Instruction(const MCInstrDesc *Description, StringRef Name, SmallVector Operands, SmallVector Variables, const BitVector *ImplDefRegs, const BitVector *ImplUseRegs, const BitVector *AllDefRegs, - const BitVector *AllUseRegs, const BitVector *NonMemoryRegs); + const BitVector *AllUseRegs); }; // Instructions are expensive to instantiate. This class provides a cache of diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt b/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt deleted file mode 100644 index 489ac6d6e34b33..00000000000000 --- a/llvm/tools/llvm-exegesis/lib/RISCV/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -include_directories( - ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV - ${LLVM_BINARY_DIR}/lib/Target/RISCV -) - -set(LLVM_LINK_COMPONENTS - CodeGen - RISCV - Exegesis - Core - Support - ) - -add_llvm_library(LLVMExegesisRISCV - DISABLE_LLVM_LINK_LLVM_DYLIB - STATIC - Target.cpp - - DEPENDS - intrinsics_gen - RISCVCommonTableGen - ) diff --git a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp b/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp deleted file mode 100644 index 891818b625fe14..00000000000000 --- a/llvm/tools/llvm-exegesis/lib/RISCV/Target.cpp +++ /dev/null @@ -1,275 +0,0 @@ -//===-- Target.cpp ----------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "../Target.h" - -#include "MCTargetDesc/RISCVBaseInfo.h" -#include "MCTargetDesc/RISCVMCTargetDesc.h" -#include "MCTargetDesc/RISCVMatInt.h" -#include "RISCVInstrInfo.h" - -// include computeAvailableFeatures and computeRequiredFeatures. -#define GET_AVAILABLE_OPCODE_CHECKER -#include "RISCVGenInstrInfo.inc" - -#include "llvm/CodeGen/MachineInstrBuilder.h" - -#include - -namespace llvm { -namespace exegesis { - -namespace { - -// Stores constant value to a general-purpose (integer) register. -static std::vector loadIntReg(const MCSubtargetInfo &STI, unsigned Reg, - const APInt &Value) { - SmallVector MCInstSeq; - std::vector MatIntInstrs; - MCRegister DestReg = Reg; - - RISCVMatInt::generateMCInstSeq(Value.getSExtValue(), STI, DestReg, MCInstSeq); - MatIntInstrs.resize(MCInstSeq.size()); - std::copy(MCInstSeq.begin(), MCInstSeq.end(), MatIntInstrs.begin()); - - return MatIntInstrs; -} - -const unsigned ScratchIntReg = RISCV::X30; // t5 - -// Stores constant bits to a floating-point register. -static std::vector loadFPRegBits(const MCSubtargetInfo &STI, - unsigned Reg, const APInt &Bits, - unsigned FmvOpcode) { - std::vector Instrs = loadIntReg(STI, ScratchIntReg, Bits); - Instrs.push_back(MCInstBuilder(FmvOpcode).addReg(Reg).addReg(ScratchIntReg)); - return Instrs; -} - -// main idea is: -// we support APInt only if (represented as double) it has zero fractional -// part: 1.0, 2.0, 3.0, etc... then we can do the trick: write int to tmp reg t5 -// and then do FCVT this is only reliable thing in 32-bit mode, otherwise we -// need to use __floatsidf -static std::vector loadFP64RegBits32(const MCSubtargetInfo &STI, - unsigned Reg, const APInt &Bits) { - double D = Bits.bitsToDouble(); - double IPart; - double FPart = std::modf(D, &IPart); - - if (std::abs(FPart) > std::numeric_limits::epsilon()) { - errs() << "loadFP64RegBits32 is not implemented for doubles like " << D - << ", please remove fractional part\n"; - return {}; - } - - std::vector Instrs = loadIntReg(STI, ScratchIntReg, Bits); - Instrs.push_back( - MCInstBuilder(RISCV::FCVT_D_W).addReg(Reg).addReg(ScratchIntReg)); - return Instrs; -} - -static MCInst nop() { - // ADDI X0, X0, 0 - return MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X0) - .addReg(RISCV::X0) - .addImm(0); -} - -static bool isVectorRegList(unsigned Reg) { - return RISCV::VRM2RegClass.contains(Reg) || - RISCV::VRM4RegClass.contains(Reg) || - RISCV::VRM8RegClass.contains(Reg) || - RISCV::VRN2M1RegClass.contains(Reg) || - RISCV::VRN2M2RegClass.contains(Reg) || - RISCV::VRN2M4RegClass.contains(Reg) || - RISCV::VRN3M1RegClass.contains(Reg) || - RISCV::VRN3M2RegClass.contains(Reg) || - RISCV::VRN4M1RegClass.contains(Reg) || - RISCV::VRN4M2RegClass.contains(Reg) || - RISCV::VRN5M1RegClass.contains(Reg) || - RISCV::VRN6M1RegClass.contains(Reg) || - RISCV::VRN7M1RegClass.contains(Reg) || - RISCV::VRN8M1RegClass.contains(Reg); -} - -class ExegesisRISCVTarget : public ExegesisTarget { -public: - ExegesisRISCVTarget(); - - bool matchesArch(Triple::ArchType Arch) const override; - - std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, - const APInt &Value) const override; - - unsigned getDefaultLoopCounterRegister(const Triple &) const override; - - void decrementLoopCounterAndJump(MachineBasicBlock &MBB, - MachineBasicBlock &TargetMBB, - const MCInstrInfo &MII, - unsigned LoopRegister) const override; - - unsigned 
getScratchMemoryRegister(const Triple &TT) const override; - - void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, - unsigned Offset) const override; - - ArrayRef getUnavailableRegisters() const override; - - Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var, - MCOperand &AssignedValue, - const BitVector &ForbiddenRegs) const override; - - std::vector - generateInstructionVariants(const Instruction &Instr, - unsigned MaxConfigsPerOpcode) const override; -}; - -ExegesisRISCVTarget::ExegesisRISCVTarget() - : ExegesisTarget(ArrayRef{}, - RISCV_MC::isOpcodeAvailable) {} - -#define GET_REGISTER_MATCHER -#include "RISCVGenAsmMatcher.inc" - -bool ExegesisRISCVTarget::matchesArch(Triple::ArchType Arch) const { - return Arch == Triple::riscv32 || Arch == Triple::riscv64; -} - -std::vector ExegesisRISCVTarget::setRegTo(const MCSubtargetInfo &STI, - unsigned Reg, - const APInt &Value) const { - if (RISCV::GPRRegClass.contains(Reg)) - return loadIntReg(STI, Reg, Value); - if (RISCV::FPR16RegClass.contains(Reg)) - return loadFPRegBits(STI, Reg, Value, RISCV::FMV_H_X); - if (RISCV::FPR32RegClass.contains(Reg)) - return loadFPRegBits(STI, Reg, Value, RISCV::FMV_W_X); - if (RISCV::FPR64RegClass.contains(Reg)) { - if (STI.hasFeature(RISCV::Feature64Bit)) - return loadFPRegBits(STI, Reg, Value, RISCV::FMV_D_X); - return loadFP64RegBits32(STI, Reg, Value); - } - if (Reg == RISCV::FRM || Reg == RISCV::VL || Reg == RISCV::VLENB || - Reg == RISCV::VTYPE || RISCV::GPRPairRegClass.contains(Reg) || - RISCV::VRRegClass.contains(Reg) || isVectorRegList(Reg)) { - // Don't initialize: - // - FRM - // - VL, VLENB, VTYPE - // - vector registers (and vector register lists) - // - Zfinx registers - // Generate 'NOP' so that exegesis treats such registers as initialized - // (it tries to initialize them with '0' anyway). - return {nop()}; - } - errs() << "setRegTo is not implemented for Reg " << Reg - << ", results will be unreliable\n"; - return {}; -} - -const unsigned DefaultLoopCounterReg = RISCV::X31; // t6 -const unsigned ScratchMemoryReg = RISCV::X10; // a0 - -unsigned -ExegesisRISCVTarget::getDefaultLoopCounterRegister(const Triple &) const { - return DefaultLoopCounterReg; -} - -void ExegesisRISCVTarget::decrementLoopCounterAndJump( - MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, - const MCInstrInfo &MII, unsigned LoopRegister) const { - BuildMI(&MBB, DebugLoc(), MII.get(RISCV::ADDI)) - .addDef(LoopRegister) - .addUse(LoopRegister) - .addImm(-1); - BuildMI(&MBB, DebugLoc(), MII.get(RISCV::BNE)) - .addUse(LoopRegister) - .addUse(RISCV::X0) - .addMBB(&TargetMBB); -} - -unsigned ExegesisRISCVTarget::getScratchMemoryRegister(const Triple &TT) const { - return ScratchMemoryReg; // a0 -} - -void ExegesisRISCVTarget::fillMemoryOperands(InstructionTemplate &IT, - unsigned Reg, - unsigned Offset) const { - // TODO: for now we ignore Offset because have no way - // to detect it in instruction. 
- auto &I = IT.getInstr(); - - auto MemOpIt = - find_if(I.Operands, [](const Operand &Op) { return Op.isMemory(); }); - assert(MemOpIt != I.Operands.end() && - "Instruction must have memory operands"); - - const Operand &MemOp = *MemOpIt; - - assert(MemOp.isReg() && "Memory operand expected to be register"); - - IT.getValueFor(MemOp) = MCOperand::createReg(Reg); -} - -const unsigned UnavailableRegisters[4] = {RISCV::X0, DefaultLoopCounterReg, - ScratchIntReg, ScratchMemoryReg}; - -ArrayRef ExegesisRISCVTarget::getUnavailableRegisters() const { - return UnavailableRegisters; -} - -Error ExegesisRISCVTarget::randomizeTargetMCOperand( - const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue, - const BitVector &ForbiddenRegs) const { - uint8_t OperandType = - Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType; - - switch (OperandType) { - case RISCVOp::OPERAND_FRMARG: - AssignedValue = MCOperand::createImm(RISCVFPRndMode::DYN); - break; - case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO: - AssignedValue = MCOperand::createImm(0b1 << 4); - break; - case RISCVOp::OPERAND_SIMM6_NONZERO: - case RISCVOp::OPERAND_UIMMLOG2XLEN_NONZERO: - AssignedValue = MCOperand::createImm(1); - break; - default: - if (OperandType >= RISCVOp::OPERAND_FIRST_RISCV_IMM && - OperandType <= RISCVOp::OPERAND_LAST_RISCV_IMM) - AssignedValue = MCOperand::createImm(0); - } - return Error::success(); -} - -std::vector -ExegesisRISCVTarget::generateInstructionVariants( - const Instruction &Instr, unsigned int MaxConfigsPerOpcode) const { - InstructionTemplate IT{&Instr}; - for (const Operand &Op : Instr.Operands) - if (Op.isMemory()) { - IT.getValueFor(Op) = MCOperand::createReg(ScratchMemoryReg); - } - return {IT}; -} - -} // anonymous namespace - -static ExegesisTarget *getTheRISCVExegesisTarget() { - static ExegesisRISCVTarget Target; - return &Target; -} - -void InitializeRISCVExegesisTarget() { - ExegesisTarget::registerTarget(getTheRISCVExegesisTarget()); -} - -} // namespace exegesis -} // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp index 25cdf1ce66d449..7100b51bbb7298 100644 --- a/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SerialSnippetGenerator.cpp @@ -53,6 +53,13 @@ computeAliasingInstructions(const LLVMState &State, const Instruction *Instr, if (OtherOpcode == Instr->Description.getOpcode()) continue; const Instruction &OtherInstr = State.getIC().getInstr(OtherOpcode); + const MCInstrDesc &OtherInstrDesc = OtherInstr.Description; + // Ignore instructions that we cannot run. 
+ if (OtherInstrDesc.isPseudo() || OtherInstrDesc.usesCustomInsertionHook() || + OtherInstrDesc.isBranch() || OtherInstrDesc.isIndirectBranch() || + OtherInstrDesc.isCall() || OtherInstrDesc.isReturn()) { + continue; + } if (OtherInstr.hasMemoryOperands()) continue; if (!ET.allowAsBackToBack(OtherInstr)) @@ -74,10 +81,12 @@ static ExecutionMode getExecutionModes(const Instruction &Instr, EM |= ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS; if (Instr.hasMemoryOperands()) EM |= ExecutionMode::SERIAL_VIA_MEMORY_INSTR; - if (Instr.hasAliasingNotMemoryRegisters(ForbiddenRegisters)) - EM |= ExecutionMode::SERIAL_VIA_EXPLICIT_REGS; - if (Instr.hasOneUseOrOneDef()) - EM |= ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR; + else { + if (Instr.hasAliasingRegisters(ForbiddenRegisters)) + EM |= ExecutionMode::SERIAL_VIA_EXPLICIT_REGS; + if (Instr.hasOneUseOrOneDef()) + EM |= ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR; + } return EM; } diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp index 282bc8ca912492..b37999ab017f59 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp @@ -37,10 +37,10 @@ namespace { // An MCStreamer that reads a BenchmarkCode definition from a file. class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { public: - explicit BenchmarkCodeStreamer(const ExegesisTarget &Target, - MCContext *Context, const LLVMState &State, + explicit BenchmarkCodeStreamer(MCContext *Context, const LLVMState &State, BenchmarkCode *Result) - : MCStreamer(*Context), Target(Target), State(State), Result(Result) {} + : MCStreamer(*Context), State(State), Result(Result) {} + // Implementation of the MCStreamer interface. We only care about // instructions. void emitInstruction(const MCInst &Instruction, @@ -218,7 +218,6 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { return *RegisterNumber; } - const ExegesisTarget &Target; const LLVMState &State; BenchmarkCode *const Result; unsigned InvalidComments = 0; @@ -252,8 +251,7 @@ Expected> readSnippets(const LLVMState &State, TM.getTarget().createMCObjectFileInfo(Context, /*PIC=*/false)); Context.setObjectFileInfo(ObjectFileInfo.get()); Context.initInlineSourceManager(); - BenchmarkCodeStreamer Streamer(State.getExegesisTarget(), &Context, State, - &Result); + BenchmarkCodeStreamer Streamer(&Context, State, &Result); std::string Error; raw_string_ostream ErrorStream(Error); diff --git a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp index 48357d443f713e..7dcff60a8fd11f 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -73,9 +73,6 @@ Error SnippetGenerator::generateConfigurations( for (CodeTemplate &CT : Templates) { // TODO: Generate as many BenchmarkCode as needed. { - CT.ScratchSpacePointerInReg = - State.getExegesisTarget().getScratchMemoryRegister( - State.getTargetMachine().getTargetTriple()); BenchmarkCode BC; BC.Info = CT.Info; BC.Key.Instructions.reserve(CT.Instructions.size()); @@ -111,12 +108,6 @@ std::vector SnippetGenerator::computeRegisterInitialValues( // Loop invariant: DefinedRegs[i] is true iif it has been set at least once // before the current instruction. BitVector DefinedRegs = State.getRATC().emptyRegisters(); - // If target always expects a scratch memory register as live input, - // mark it as defined. 
- const ExegesisTarget &Target = State.getExegesisTarget(); - unsigned ScratchMemoryReg = Target.getScratchMemoryRegister( - State.getTargetMachine().getTargetTriple()); - DefinedRegs.set(ScratchMemoryReg); std::vector RIV; for (const InstructionTemplate &IT : Instructions) { // Returns the register that this Operand sets or uses, or 0 if this is not @@ -209,8 +200,7 @@ static void setRegisterOperandValue(const RegisterOperandAssignment &ROV, if (ROV.Op->isExplicit()) { auto &AssignedValue = IB.getValueFor(*ROV.Op); if (AssignedValue.isValid()) { - // TODO don't re-assign register operands which are already "locked" - // by Target in corresponding InstructionTemplate + assert(AssignedValue.isReg() && AssignedValue.getReg() == ROV.Reg); return; } AssignedValue = MCOperand::createReg(ROV.Reg); diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index fa37e05956be8c..546ec770a8d221 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -274,10 +274,6 @@ static cl::opt BenchmarkProcessCPU( cl::desc("The CPU number that the benchmarking process should executon on"), cl::cat(BenchmarkOptions), cl::init(-1)); -static cl::opt MAttr( - "mattr", cl::desc("comma-separated list of target architecture features"), - cl::value_desc("+feature1,-feature2,..."), cl::cat(Options), cl::init("")); - static ExitOnError ExitOnErr("llvm-exegesis error: "); // Helper function that logs the error(s) and exits. @@ -300,18 +296,6 @@ T ExitOnFileError(const Twine &FileName, Expected &&E) { return std::move(*E); } -static const char *getIgnoredOpcodeReasonOrNull(const LLVMState &State, - unsigned Opcode) { - const MCInstrDesc &InstrDesc = State.getIC().getInstr(Opcode).Description; - if (InstrDesc.isPseudo() || InstrDesc.usesCustomInsertionHook()) - return "Unsupported opcode: isPseudo/usesCustomInserter"; - if (InstrDesc.isBranch() || InstrDesc.isIndirectBranch()) - return "Unsupported opcode: isBranch/isIndirectBranch"; - if (InstrDesc.isCall() || InstrDesc.isReturn()) - return "Unsupported opcode: isCall/isReturn"; - return nullptr; -} - // Checks that only one of OpcodeNames, OpcodeIndex or SnippetsFile is provided, // and returns the opcode indices or {} if snippets should be read from // `SnippetsFile`. @@ -350,7 +334,6 @@ static std::vector getOpcodesOrDie(const LLVMState &State) { return I->getSecond(); return 0u; }; - SmallVector Pieces; StringRef(OpcodeNames.getValue()) .split(Pieces, ",", /* MaxSplit */ -1, /* KeepEmpty */ false); @@ -369,11 +352,17 @@ static std::vector getOpcodesOrDie(const LLVMState &State) { static Expected> generateSnippets(const LLVMState &State, unsigned Opcode, const BitVector &ForbiddenRegs) { + const Instruction &Instr = State.getIC().getInstr(Opcode); + const MCInstrDesc &InstrDesc = Instr.Description; // Ignore instructions that we cannot run. 
- if (const char *Reason = getIgnoredOpcodeReasonOrNull(State, Opcode)) - return make_error(Reason); + if (InstrDesc.isPseudo() || InstrDesc.usesCustomInsertionHook()) + return make_error( + "Unsupported opcode: isPseudo/usesCustomInserter"); + if (InstrDesc.isBranch() || InstrDesc.isIndirectBranch()) + return make_error("Unsupported opcode: isBranch/isIndirectBranch"); + if (InstrDesc.isCall() || InstrDesc.isReturn()) + return make_error("Unsupported opcode: isCall/isReturn"); - const Instruction &Instr = State.getIC().getInstr(Opcode); const std::vector InstructionVariants = State.getExegesisTarget().generateInstructionVariants( Instr, MaxConfigsPerOpcode); @@ -496,8 +485,8 @@ void benchmarkMain() { LLVMInitialize##TargetName##AsmParser(); #include "llvm/Config/TargetExegesis.def" - const LLVMState State = ExitOnErr( - LLVMState::Create(TripleName, MCPU, MAttr, UseDummyPerfCounters)); + const LLVMState State = + ExitOnErr(LLVMState::Create(TripleName, MCPU, "", UseDummyPerfCounters)); // Preliminary check to ensure features needed for requested // benchmark mode are present on target CPU and/or OS. From c6967efe780d6cc5d70fc8cadbd227353b6768f1 Mon Sep 17 00:00:00 2001 From: Andrei Safronov Date: Wed, 18 Dec 2024 15:37:08 +0300 Subject: [PATCH 25/37] [Xtensa] Implement Code Density Option. (#119639) The Code Density option adds 16-bit encodings for frequently used instructions. --- .../Xtensa/AsmParser/XtensaAsmParser.cpp | 11 ++ .../Disassembler/XtensaDisassembler.cpp | 81 ++++++++++++-- .../Xtensa/MCTargetDesc/XtensaAsmBackend.cpp | 4 +- .../Xtensa/MCTargetDesc/XtensaInstPrinter.cpp | 22 ++++ .../Xtensa/MCTargetDesc/XtensaInstPrinter.h | 2 + .../MCTargetDesc/XtensaMCCodeEmitter.cpp | 55 +++++++++- llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp | 9 +- llvm/lib/Target/Xtensa/XtensaISelLowering.cpp | 7 +- llvm/lib/Target/Xtensa/XtensaInstrInfo.td | 101 ++++++++++++++++++ llvm/lib/Target/Xtensa/XtensaOperands.td | 14 +++ .../MC/Disassembler/Xtensa/code_density.txt | 64 +++++++++++ .../test/MC/Disassembler/Xtensa/lit.local.cfg | 2 + llvm/test/MC/Xtensa/Relocations/fixups.s | 23 ++-- llvm/test/MC/Xtensa/Relocations/relocations.s | 12 ++- llvm/test/MC/Xtensa/code_density-invalid.s | 21 ++++ llvm/test/MC/Xtensa/code_density.s | 68 ++++++++++++ 16 files changed, 471 insertions(+), 25 deletions(-) create mode 100644 llvm/test/MC/Disassembler/Xtensa/code_density.txt create mode 100644 llvm/test/MC/Disassembler/Xtensa/lit.local.cfg create mode 100644 llvm/test/MC/Xtensa/code_density-invalid.s create mode 100644 llvm/test/MC/Xtensa/code_density.s diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp index 83b1cfca529bf3..731f9535ca251f 100644 --- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp +++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp @@ -193,6 +193,11 @@ struct XtensaOperand : public MCParsedAsmOperand { bool isImm1_16() const { return isImm(1, 16); } + // Check that the value either equals -1 or is in the [1, 15] range.
+ bool isImm1n_15() const { return isImm(1, 15) || isImm(-1, -1); } + + bool isImm32n_95() const { return isImm(-32, 95); } + bool isB4const() const { if (Kind != Immediate) return false; @@ -480,6 +485,12 @@ bool XtensaAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidImm1_16: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range [1, 16]"); + case Match_InvalidImm1n_15: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range [-1, 15] except 0"); + case Match_InvalidImm32n_95: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range [-32, 95]"); case Match_InvalidShimm1_31: return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected immediate in range [1, 31]"); diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp index 2d36b94dd40c77..c11c4b7038bdb7 100644 --- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp +++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp @@ -38,9 +38,7 @@ class XtensaDisassembler : public MCDisassembler { XtensaDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool isLE) : MCDisassembler(STI, Ctx), IsLittleEndian(isLE) {} - bool hasDensity() const { - return STI.hasFeature(Xtensa::FeatureDensity); - } + bool hasDensity() const { return STI.hasFeature(Xtensa::FeatureDensity); } DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, @@ -99,8 +97,8 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, uint64_t InstSize, MCInst &MI, const void *Decoder) { const MCDisassembler *Dis = static_cast(Decoder); - return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, /*OpSize=*/0, - InstSize); + return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, + /*OpSize=*/0, InstSize); } static DecodeStatus decodeCallOperand(MCInst &Inst, uint64_t Imm, @@ -190,6 +188,28 @@ static DecodeStatus decodeImm1_16Operand(MCInst &Inst, uint64_t Imm, return MCDisassembler::Success; } +static DecodeStatus decodeImm1n_15Operand(MCInst &Inst, uint64_t Imm, + int64_t Address, + const void *Decoder) { + assert(isUInt<4>(Imm) && "Invalid immediate"); + if (!Imm) + Inst.addOperand(MCOperand::createImm(-1)); + else + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} + +static DecodeStatus decodeImm32n_95Operand(MCInst &Inst, uint64_t Imm, + int64_t Address, + const void *Decoder) { + assert(isUInt<7>(Imm) && "Invalid immediate"); + if ((Imm & 0x60) == 0x60) + Inst.addOperand(MCOperand::createImm((~0x1f) | Imm)); + else + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} + static DecodeStatus decodeShimm1_31Operand(MCInst &Inst, uint64_t Imm, int64_t Address, const void *Decoder) { @@ -243,9 +263,37 @@ static DecodeStatus decodeMem32Operand(MCInst &Inst, uint64_t Imm, return MCDisassembler::Success; } +static DecodeStatus decodeMem32nOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + assert(isUInt<8>(Imm) && "Invalid immediate"); + DecodeARRegisterClass(Inst, Imm & 0xf, Address, Decoder); + Inst.addOperand(MCOperand::createImm((Imm >> 2) & 0x3c)); + return MCDisassembler::Success; +} + +/// Read two bytes from the ArrayRef and return 16 bit data sorted +/// according to the given endianness. 
+static DecodeStatus readInstruction16(ArrayRef Bytes, uint64_t Address, + uint64_t &Size, uint64_t &Insn, + bool IsLittleEndian) { + // We want to read exactly 2 Bytes of data. + if (Bytes.size() < 2) { + Size = 0; + return MCDisassembler::Fail; + } + + if (!IsLittleEndian) { + report_fatal_error("Big-endian mode currently is not supported!"); + } else { + Insn = (Bytes[1] << 8) | Bytes[0]; + } + + return MCDisassembler::Success; +} + /// Read three bytes from the ArrayRef and return 24 bit data static DecodeStatus readInstruction24(ArrayRef Bytes, uint64_t Address, - uint64_t &Size, uint32_t &Insn, + uint64_t &Size, uint64_t &Insn, bool IsLittleEndian) { // We want to read exactly 3 Bytes of data. if (Bytes.size() < 3) { @@ -259,7 +307,6 @@ static DecodeStatus readInstruction24(ArrayRef Bytes, uint64_t Address, Insn = (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); } - Size = 3; return MCDisassembler::Success; } @@ -269,13 +316,31 @@ DecodeStatus XtensaDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes, uint64_t Address, raw_ostream &CS) const { - uint32_t Insn; + uint64_t Insn; DecodeStatus Result; + // Parse 16-bit instructions + if (hasDensity()) { + Result = readInstruction16(Bytes, Address, Size, Insn, IsLittleEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + LLVM_DEBUG(dbgs() << "Trying Xtensa 16-bit instruction table :\n"); + Result = decodeInstruction(DecoderTable16, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + return Result; + } + } + + // Parse Core 24-bit instructions Result = readInstruction24(Bytes, Address, Size, Insn, IsLittleEndian); if (Result == MCDisassembler::Fail) return MCDisassembler::Fail; LLVM_DEBUG(dbgs() << "Trying Xtensa 24-bit instruction table :\n"); Result = decodeInstruction(DecoderTable24, MI, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 3; + return Result; + } return Result; } diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp index a296a22247a5c0..c1fb46e69e6fbe 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp @@ -88,8 +88,10 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_8: return Value; case Xtensa::fixup_xtensa_branch_6: { + if (!Value) + return 0; Value -= 4; - if (!isInt<6>(Value)) + if (!isUInt<6>(Value)) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); unsigned Hi2 = (Value >> 4) & 0x3; unsigned Lo4 = Value & 0xf; diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp index e04d7bd211216f..df8a0854f06f41 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp @@ -242,6 +242,28 @@ void XtensaInstPrinter::printImm1_16_AsmOperand(const MCInst *MI, int OpNum, printOperand(MI, OpNum, O); } +void XtensaInstPrinter::printImm1n_15_AsmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + if (MI->getOperand(OpNum).isImm()) { + int64_t Value = MI->getOperand(OpNum).getImm(); + assert((Value >= -1 && (Value != 0) && Value <= 15) && + "Invalid argument, value must be in ranges <-1,-1> or <1,15>"); + O << Value; + } else + printOperand(MI, OpNum, O); +} + +void XtensaInstPrinter::printImm32n_95_AsmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + if 
(MI->getOperand(OpNum).isImm()) { + int64_t Value = MI->getOperand(OpNum).getImm(); + assert((Value >= -32 && Value <= 95) && + "Invalid argument, value must be in ranges <-32,95>"); + O << Value; + } else + printOperand(MI, OpNum, O); +} + void XtensaInstPrinter::printOffset8m8_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { if (MI->getOperand(OpNum).isImm()) { diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h index f56d5d1458dc11..e5bc67869e103d 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.h @@ -58,6 +58,8 @@ class XtensaInstPrinter : public MCInstPrinter { void printUimm5_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printShimm1_31_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printImm1_16_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printImm1n_15_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printImm32n_95_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printOffset8m8_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printOffset8m16_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); void printOffset8m32_AsmOperand(const MCInst *MI, int OpNum, raw_ostream &O); diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp index 1afdbb38f9571a..51d4b8a9cc5fc5 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCCodeEmitter.cpp @@ -103,6 +103,14 @@ class XtensaMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint32_t getImm1n_15OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getImm32n_95OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getShimm1_31OpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -188,6 +196,11 @@ uint32_t XtensaMCCodeEmitter::getBranchTargetEncoding( Fixups.push_back(MCFixup::create( 0, Expr, MCFixupKind(Xtensa::fixup_xtensa_branch_12), MI.getLoc())); return 0; + case Xtensa::BEQZ_N: + case Xtensa::BNEZ_N: + Fixups.push_back(MCFixup::create( + 0, Expr, MCFixupKind(Xtensa::fixup_xtensa_branch_6), MI.getLoc())); + return 0; default: Fixups.push_back(MCFixup::create( 0, Expr, MCFixupKind(Xtensa::fixup_xtensa_branch_8), MI.getLoc())); @@ -255,14 +268,24 @@ XtensaMCCodeEmitter::getMemRegEncoding(const MCInst &MI, unsigned OpNo, break; case Xtensa::S32I: case Xtensa::L32I: + case Xtensa::S32I_N: + case Xtensa::L32I_N: if (Res & 0x3) { report_fatal_error("Unexpected operand value!"); } Res >>= 2; break; } - - assert((isUInt<8>(Res)) && "Unexpected operand value!"); + + switch (MI.getOpcode()) { + case Xtensa::S32I_N: + case Xtensa::L32I_N: + assert((isUInt<4>(Res)) && "Unexpected operand value!"); + break; + default: + assert((isUInt<8>(Res)) && "Unexpected operand value!"); + break; + } uint32_t OffBits = Res << 4; uint32_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); @@ -354,6 +377,34 @@ XtensaMCCodeEmitter::getImm1_16OpValue(const MCInst &MI, unsigned OpNo, return (Res - 1); } +uint32_t +XtensaMCCodeEmitter::getImm1n_15OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo 
&STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + int32_t Res = static_cast(MO.getImm()); + + assert(((Res >= -1) && (Res <= 15) && (Res != 0)) && + "Unexpected operand value!"); + + if (Res < 0) + Res = 0; + + return Res; +} + +uint32_t +XtensaMCCodeEmitter::getImm32n_95OpValue(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + int32_t Res = static_cast(MO.getImm()); + + assert(((Res >= -32) && (Res <= 95)) && "Unexpected operand value!"); + + return Res; +} + uint32_t XtensaMCCodeEmitter::getB4constOpValue(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, diff --git a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp index af1110487b4274..ef14095d18efbf 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp @@ -27,10 +27,17 @@ using namespace llvm; namespace { class XtensaDAGToDAGISel : public SelectionDAGISel { + const XtensaSubtarget *Subtarget = nullptr; + public: - XtensaDAGToDAGISel(XtensaTargetMachine &TM, CodeGenOptLevel OptLevel) + explicit XtensaDAGToDAGISel(XtensaTargetMachine &TM, CodeGenOptLevel OptLevel) : SelectionDAGISel(TM, OptLevel) {} + bool runOnMachineFunction(MachineFunction &MF) override { + Subtarget = &MF.getSubtarget(); + return SelectionDAGISel::runOnMachineFunction(MF); + } + void Select(SDNode *Node) override; bool SelectInlineAsmMemoryOperand(const SDValue &Op, diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index 7e43c03ee72cac..6dfda02b7622b8 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -506,7 +506,8 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Memcpy = DAG.getMemcpy( Chain, DL, Address, ArgValue, SizeNode, Flags.getNonZeroByValAlign(), /*isVolatile=*/false, /*AlwaysInline=*/false, - /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), MachinePointerInfo()); + /*CI=*/nullptr, std::nullopt, MachinePointerInfo(), + MachinePointerInfo()); MemOpChains.push_back(Memcpy); } else { assert(VA.isMemLoc() && "Argument not register or memory"); @@ -1319,10 +1320,12 @@ MachineBasicBlock *XtensaTargetLowering::EmitInstrWithCustomInserter( case Xtensa::S8I: case Xtensa::S16I: case Xtensa::S32I: + case Xtensa::S32I_N: case Xtensa::L8UI: case Xtensa::L16SI: case Xtensa::L16UI: - case Xtensa::L32I: { + case Xtensa::L32I: + case Xtensa::L32I_N: { // Insert memory wait instruction "memw" before volatile load/store as it is // implemented in gcc. If memoperands is empty then assume that it aslo // maybe volatile load/store and insert "memw". 
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index e21de0448aa5ae..699d0d6cf80445 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -577,3 +577,104 @@ let usesCustomInserter = 1 in { "!select $dst, $lhs, $rhs, $t, $f, $cond", [(set i32:$dst, (Xtensa_select_cc i32:$lhs, i32:$rhs, i32:$t, i32:$f, imm:$cond))]>; } + +//===----------------------------------------------------------------------===// +// Code Density instructions +//===----------------------------------------------------------------------===// + +class ArithLogic_RRRN oper0, string instrAsm, + SDPatternOperator opNode, bit isComm = 0> + : RRRN_Inst, Requires<[HasDensity]> { + let isCommutable = isComm; + let isReMaterializable = 0; +} + +def ADD_N : ArithLogic_RRRN<0x0a, "add.n", add, 1>; + +def ADDI_N : RRRN_Inst<0x0B, (outs AR:$r), (ins AR:$s, imm1n_15:$imm), + "addi.n\t$r, $s, $imm", + [(set AR:$r, (add AR:$s, imm1n_15:$imm))]>, Requires<[HasDensity]> { + bits<4> imm; + + let t = imm; +} + +// Conditional branch instructions. +let isBranch = 1, isTerminator = 1 in { + def BEQZ_N : RI6_Inst<0xC, 0x1, 0x0, (outs), (ins AR:$s, brtarget:$target), + "beqz.n\t$s, $target", []>, Requires<[HasDensity]> { + bits<6> target; + + let imm6 = target; + } + + def BNEZ_N : RI6_Inst<0xC, 0x1, 0x1, (outs), (ins AR:$s, brtarget:$target), + "bnez.n\t$s, $target", []>, Requires<[HasDensity]> { + bits<6> target; + + let imm6 = target; + } +} + +def ILL_N : RRRN_Inst<0x0D, (outs), (ins), + "ill.n", []>, Requires<[HasDensity]> { + let r = 0xF; + let s = 0x0; + let t = 0x6; +} + +def MOV_N : RRRN_Inst<0x0D, (outs AR:$t), (ins AR:$s), + "mov.n\t$t, $s", []>, Requires<[HasDensity]> { + let r = 0; +} + +def : InstAlias<"mov\t $t, $s", (OR AR:$t, AR:$s, AR:$s)>; + +def MOVI_N : RI7_Inst<0xc, 0x0, (outs AR:$s), (ins imm32n_95:$imm7), + "movi.n\t$s, $imm7", + [(set AR:$s, imm32n_95:$imm7)]>, Requires<[HasDensity]>; + +def : InstAlias<"_movi.n\t$s, $imm7", (MOVI_N AR:$s, imm32n_95:$imm7)>; + +def NOP_N : RRRN_Inst<0x0D, (outs), (ins), + "nop.n", []>, Requires<[HasDensity]> { + let r = 0xF; + let s = 0x0; + let t = 0x3; +} + +// Load instruction +let mayLoad = 1, usesCustomInserter = 1 in { + def L32I_N : RRRN_Inst<0x8, (outs AR:$t), (ins mem32n:$addr), + "l32i.n\t$t, $addr", []>, Requires<[HasDensity]> { + bits<8> addr; + + let r{3-0} = addr{7-4}; + let s{3-0} = addr{3-0}; + } +} + +// Store instruction +let mayStore = 1, usesCustomInserter = 1 in { + def S32I_N : RRRN_Inst<0x9, (outs), (ins AR:$t, mem32n:$addr), + "s32i.n\t$t, $addr", []>, Requires<[HasDensity]> { + bits<8> addr; + + let r{3-0} = addr{7-4}; + let s{3-0} = addr{3-0}; + } +} + +//Return instruction +let isReturn = 1, isTerminator = 1, + isBarrier = 1, Uses = [A0] in { + def RET_N : RRRN_Inst<0x0D, (outs), (ins), + "ret.n", [(Xtensa_ret)]>, + Requires<[HasDensity]> { + let r = 0x0F; + let s = 0; + let t = 0; + } +} diff --git a/llvm/lib/Target/Xtensa/XtensaOperands.td b/llvm/lib/Target/Xtensa/XtensaOperands.td index f41081f9bf2f96..aa72fa0a56a6f5 100644 --- a/llvm/lib/Target/Xtensa/XtensaOperands.td +++ b/llvm/lib/Target/Xtensa/XtensaOperands.td @@ -72,6 +72,20 @@ def imm1_16 : Immediate= 1 && Imm <= 16; }], "Imm1_16_AsmOp let DecoderMethod = "decodeImm1_16Operand"; } +// imm1n_15 predicate - Immediate in the range [-1,15], except 0 +def Imm1n_15_AsmOperand: ImmAsmOperand<"Imm1n_15">; +def imm1n_15: Immediate= -1 && Imm <= 15 && Imm != 0; }], "Imm1n_15_AsmOperand"> { + let 
EncoderMethod = "getImm1n_15OpValue"; + let DecoderMethod = "decodeImm1n_15Operand"; +} + +// imm32n_95 predicate - Immediate in the range [-32,95] +def Imm32n_95_AsmOperand: ImmAsmOperand<"Imm32n_95">; +def imm32n_95: Immediate= -32 && Imm <= 95; }], "Imm32n_95_AsmOperand"> { + let EncoderMethod = "getImm32n_95OpValue"; + let DecoderMethod = "decodeImm32n_95Operand"; +} + // shimm1_31 predicate - Immediate in the range [1,31] def Shimm1_31_AsmOperand : ImmAsmOperand<"Shimm1_31">; def shimm1_31 : Immediate= 1 && Imm <= 31; }], "Shimm1_31_AsmOperand"> { diff --git a/llvm/test/MC/Disassembler/Xtensa/code_density.txt b/llvm/test/MC/Disassembler/Xtensa/code_density.txt new file mode 100644 index 00000000000000..b2c91bcfbaefec --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/code_density.txt @@ -0,0 +1,64 @@ +# RUN: llvm-mc -triple=xtensa -mattr=+density -disassemble < %s | FileCheck -check-prefixes=CHECK-DENSITY %s +# RUN: llvm-mc -triple=xtensa -disassemble %s &> %t +# RUN: FileCheck -check-prefixes=CHECK-CORE < %t %s + +#------------------------------------------------------------------------------ +# Verify that binary code is correctly disassembled with +# code density option enabled. Also verify that dissasembling without +# density option generates warnings. +#------------------------------------------------------------------------------ + +0x4a 0x23 +# CHECK-DENSITY: add.n a2, a3, a4 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x3b 0x23 +# CHECK-DENSITY: addi.n a2, a3, 3 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x9c 0x03 +# CHECK-DENSITY: beqz.n a3, . +20 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0xcc 0xe3 +# CHECK-DENSITY: bnez.n a3, . 
+18 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x6d 0xf0 +# CHECK-DENSITY: ill.n +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x28 0x33 +# CHECK-DENSITY: l32i.n a2, a3, 12 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x2d 0x03 +# CHECK-DENSITY: mov.n a2, a3 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x0d 0xf0 +# CHECK-DENSITY: ret.n +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x29 0x33 +# CHECK-DENSITY: s32i.n a2, a3, 12 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x6c 0x02 +# CHECK-DENSITY: movi.n a2, -32 +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding + +0x3d 0xf0 +# CHECK-DENSITY: nop.n +# CHECK-CORE: [[#@LINE-2]]:1: warning: invalid instruction encoding +# CHECK-CORE: [[#@LINE-3]]:6: warning: invalid instruction encoding diff --git a/llvm/test/MC/Disassembler/Xtensa/lit.local.cfg b/llvm/test/MC/Disassembler/Xtensa/lit.local.cfg new file mode 100644 index 00000000000000..e81bfa773f36a8 --- /dev/null +++ b/llvm/test/MC/Disassembler/Xtensa/lit.local.cfg @@ -0,0 +1,2 @@ +if not "Xtensa" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/MC/Xtensa/Relocations/fixups.s b/llvm/test/MC/Xtensa/Relocations/fixups.s index cd76f2a23322d8..0a3a9eeef1159c 100644 --- a/llvm/test/MC/Xtensa/Relocations/fixups.s +++ b/llvm/test/MC/Xtensa/Relocations/fixups.s @@ -1,7 +1,7 @@ -# RUN: llvm-mc -triple xtensa < %s -show-encoding \ +# RUN: llvm-mc -triple xtensa --mattr=+density < %s -show-encoding \ # RUN: | FileCheck -check-prefix=CHECK-FIXUP %s -# RUN: llvm-mc -filetype=obj -triple xtensa < %s \ -# RUN: | llvm-objdump -d - | FileCheck -check-prefix=CHECK-INSTR %s +# RUN: llvm-mc -filetype=obj -triple xtensa --mattr=+density < %s \ +# RUN: | llvm-objdump --mattr=+density -d - | FileCheck -check-prefix=CHECK-INSTR %s # Checks that fixups that can be resolved within the same object file are @@ -11,9 +11,13 @@ LBL0: .fill 12 +beqz.n a2, LBL1 +# CHECK-FIXUP: fixup A - offset: 0, value: LBL1, kind: fixup_xtensa_branch_6 +# CHECK-INSTR: beqz.n a2, . +29 + beq a0, a1, LBL0 # CHECK-FIXUP: fixup A - offset: 0, value: LBL0, kind: fixup_xtensa_branch_8 -# CHECK-INSTR: beq a0, a1, . -12 +# CHECK-INSTR: beq a0, a1, . -14 beq a0, a1, LBL1 # CHECK-FIXUP: fixup A - offset: 0, value: LBL1, kind: fixup_xtensa_branch_8 @@ -21,7 +25,7 @@ beq a0, a1, LBL1 beqz a2, LBL0 # CHECK-FIXUP: fixup A - offset: 0, value: LBL0, kind: fixup_xtensa_branch_12 -# CHECK-INSTR: beqz a2, . -18 +# CHECK-INSTR: beqz a2, . -20 beqz a2, LBL1 # CHECK-FIXUP: fixup A - offset: 0, value: LBL1, kind: fixup_xtensa_branch_12 @@ -33,22 +37,23 @@ call0 LBL0 call0 LBL2 # CHECK-FIXUP: fixup A - offset: 0, value: LBL2, kind: fixup_xtensa_call_18 -# CHECK-INSTR: call0 . +2056 +# CHECK-INSTR: call0 . +2068 j LBL0 # CHECK-FIXUP: fixup A - offset: 0, value: LBL0, kind: fixup_xtensa_jump_18 -# CHECK-INSTR: j . -30 +# CHECK-INSTR: j . 
-32 j LBL2 # CHECK-FIXUP: fixup A - offset: 0, value: LBL2, kind: fixup_xtensa_jump_18 -# CHECK-INSTR: j . +2047 +# CHECK-INSTR: j . +2061 l32r a1, LBL0 # CHECK-FIXUP: fixup A - offset: 0, value: LBL0, kind: fixup_xtensa_l32r_16 -# CHECK-INSTR: l32r a1, . -36 +# CHECK-INSTR: l32r a1, . -38 LBL1: .fill 2041 +.align 4 LBL2: diff --git a/llvm/test/MC/Xtensa/Relocations/relocations.s b/llvm/test/MC/Xtensa/Relocations/relocations.s index 19c2e16352509d..339f6cb44bfcfd 100644 --- a/llvm/test/MC/Xtensa/Relocations/relocations.s +++ b/llvm/test/MC/Xtensa/Relocations/relocations.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -triple xtensa < %s -show-encoding \ +# RUN: llvm-mc -triple xtensa --mattr=+density < %s -show-encoding \ # RUN: | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s -# RUN: llvm-mc -filetype=obj -triple xtensa < %s \ +# RUN: llvm-mc -filetype=obj -triple xtensa --mattr=+density < %s \ # RUN: | llvm-readobj -r - | FileCheck -check-prefix=RELOC %s # Check prefixes: @@ -76,6 +76,14 @@ beqz a8, func # INST: beqz a8, func # FIXUP: fixup A - offset: 0, value: func, kind: fixup_xtensa_branch_12 +beqz.n a8, func +# INST: beqz.n a8, func +# FIXUP: fixup A - offset: 0, value: func, kind: fixup_xtensa_branch_6 + +bnez.n a8, func +# INST: bnez.n a8, func +# FIXUP: fixup A - offset: 0, value: func, kind: fixup_xtensa_branch_6 + bge a14, a2, func # RELOC: R_XTENSA_SLOT0_OP # INST: bge a14, a2, func diff --git a/llvm/test/MC/Xtensa/code_density-invalid.s b/llvm/test/MC/Xtensa/code_density-invalid.s new file mode 100644 index 00000000000000..b5068cb8d57ab2 --- /dev/null +++ b/llvm/test/MC/Xtensa/code_density-invalid.s @@ -0,0 +1,21 @@ +# RUN: not llvm-mc -triple xtensa --mattr=+density %s 2>&1 | FileCheck %s + +LBL0: + +# Out of range immediates + +# imm1n_15 +addi.n a2, a3, 20 +# CHECK: :[[#@LINE-1]]:16: error: expected immediate in range [-1, 15] except 0 + +# imm1n_15 +addi.n a2, a3, 0 +# CHECK: :[[#@LINE-1]]:16: error: expected immediate in range [-1, 15] except 0 + +# imm32n_95 +movi.n a2, 100 +# CHECK: :[[#@LINE-1]]:12: error: expected immediate in range [-32, 95] + +# Offset4m32 +l32i.n a2, a3, 100 +# CHECK: :[[#@LINE-1]]:16: error: expected immediate in range [0, 60], first 2 bits should be zero diff --git a/llvm/test/MC/Xtensa/code_density.s b/llvm/test/MC/Xtensa/code_density.s new file mode 100644 index 00000000000000..fe9f7e91774487 --- /dev/null +++ b/llvm/test/MC/Xtensa/code_density.s @@ -0,0 +1,68 @@ +# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+density \ +# RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s + +.align 4 +LBL0: + +# Instruction format RRRN +# CHECK-INST: add.n a2, a3, a4 +# CHECK: encoding: [0x4a,0x23] +add.n a2, a3, a4 + +# Instruction format RRRN +# CHECK-INST: addi.n a2, a3, 3 +# CHECK: encoding: [0x3b,0x23] +addi.n a2, a3, 3 + +# Instruction format RRRN +# CHECK-INST: addi.n a2, a3, -1 +# CHECK: encoding: [0x0b,0x23] +addi.n a2, a3, -1 + +# Instruction format RI6 +# CHECK-INST: beqz.n a3, LBL1 +# CHECK: encoding: [0x8c'A',0x03'A'] +beqz.n a3, LBL1 + +# Instruction format RI6 +# CHECK-INST: bnez.n a3, LBL1 +# CHECK: encoding: [0xcc'A',0x03'A'] +bnez.n a3, LBL1 + +# Instruction format RRRN +# CHECK-INST: ill.n +# CHECK: encoding: [0x6d,0xf0] +ill.n + +# Instruction format RRRN +# CHECK-INST: l32i.n a2, a3, 12 +# CHECK: encoding: [0x28,0x33] +l32i.n a2, a3, 12 + +# Instruction format RRRN +# CHECK-INST: mov.n a2, a3 +# CHECK: encoding: [0x2d,0x03] +mov.n a2, a3 + +# Instruction format RI7 +# CHECK-INST: movi.n a2, -32 +# CHECK: encoding: [0x6c,0x02] +movi.n 
a2, -32 + +# Instruction format RRRN +# CHECK-INST: nop.n +# CHECK: encoding: [0x3d,0xf0] +nop.n + +# Instruction format RRRN +# CHECK-INST: ret.n +# CHECK: encoding: [0x0d,0xf0] +ret.n + +# Instruction format RRRN +# CHECK-INST: s32i.n a2, a3, 12 +# CHECK: encoding: [0x29,0x33] +s32i.n a2, a3, 12 + +.align 4 +LBL1: From 6f68010f9123aae9f6f105d7a11af22458518ad7 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 18 Dec 2024 20:40:33 +0800 Subject: [PATCH 26/37] [InstCombine] Drop samesign flags in `foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed` (#120373) Counterexamples: https://alive2.llvm.org/ce/z/6Ks8Qz Closes https://github.com/llvm/llvm-project/issues/120361. --- .../InstCombine/InstCombineAndOrXor.cpp | 10 ++++++-- .../Transforms/InstCombine/icmp-logical.ll | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index dff9304be64ddb..e576eea4ca36a1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -455,14 +455,20 @@ static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed( // RHS. For example, // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). - if (IsSuperSetOrEqual(BCst, DCst)) + if (IsSuperSetOrEqual(BCst, DCst)) { + // We can't guarantee that samesign hold after this fold. + RHS->setSameSign(false); return RHS; + } // Otherwise, B is a subset of D. If B and E have a common bit set, // ie. (B & E) != 0, then LHS is subsumed by RHS. For example. // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8). assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code"); - if ((*BCst & ECst) != 0) + if ((*BCst & ECst) != 0) { + // We can't guarantee that samesign hold after this fold. + RHS->setSameSign(false); return RHS; + } // Otherwise, LHS and RHS contradict and the whole expression becomes false // (or true if negated.) For example, // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false. diff --git a/llvm/test/Transforms/InstCombine/icmp-logical.ll b/llvm/test/Transforms/InstCombine/icmp-logical.ll index 50feb51092fd9e..df8442e069b788 100644 --- a/llvm/test/Transforms/InstCombine/icmp-logical.ll +++ b/llvm/test/Transforms/InstCombine/icmp-logical.ll @@ -1900,3 +1900,28 @@ define i1 @masked_icmps_bmask_notmixed_not_subset_notoptimized(i32 %A) { %res = and i1 %tst1, %tst2 ret i1 %res } + +define i1 @pr120361(i8 %x, i8 %y) { +; CHECK-LABEL: @pr120361( +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[X:%.*]], -1 +; CHECK-NEXT: ret i1 [[CMP1]] +; + %cmp1 = icmp samesign eq i8 %x, -1 + %cmp2 = icmp ne i8 %x, 0 + %result = select i1 %cmp2, i1 %cmp1, i1 false + ret i1 %result +} + +define i1 @pr120361_v2(i32 %x) { +; CHECK-LABEL: @pr120361_v2( +; CHECK-NEXT: [[AND2:%.*]] = and i32 [[X:%.*]], -113 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[AND2]], 15 +; CHECK-NEXT: ret i1 [[CMP2]] +; + %and1 = and i32 %x, 15 + %cmp1 = icmp ne i32 %and1, 0 + %and2 = and i32 %x, -113 + %cmp2 = icmp samesign eq i32 %and2, 15 + %and = select i1 %cmp1, i1 %cmp2, i1 false + ret i1 %and +} From 0c6860622c249ae7adc784c66a8d0b1335a9e7df Mon Sep 17 00:00:00 2001 From: Dhruv Srivastava Date: Wed, 18 Dec 2024 18:14:31 +0530 Subject: [PATCH 27/37] [lldb][AIX] Header Parsing for XCOFF Object File in AIX (#116338) This PR is in reference to porting LLDB on AIX. 
Link to discussions on llvm discourse and github: 1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640 2. https://github.com/llvm/llvm-project/issues/101657 The complete changes for porting are present in this draft PR: https://github.com/llvm/llvm-project/pull/102601 Added XCOFF Object File Header Parsing for AIX. Details about XCOFF file format on AIX: [XCOFF](https://www.ibm.com/docs/en/aix/7.3?topic=formats-xcoff-object-file-format) --- .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp | 61 +++++++++++++++++-- .../ObjectFile/XCOFF/ObjectFileXCOFF.h | 7 +++ .../Shell/ObjectFile/XCOFF/basic-info.yaml | 2 +- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp index 564e92f3934be7..b54d43c5dd7373 100644 --- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp @@ -79,9 +79,44 @@ ObjectFile *ObjectFileXCOFF::CreateInstance(const lldb::ModuleSP &module_sp, if (!objfile_up) return nullptr; + // Cache xcoff binary. + if (!objfile_up->CreateBinary()) + return nullptr; + + if (!objfile_up->ParseHeader()) + return nullptr; + return objfile_up.release(); } +bool ObjectFileXCOFF::CreateBinary() { + if (m_binary) + return true; + + Log *log = GetLog(LLDBLog::Object); + + auto binary = llvm::object::ObjectFile::createObjectFile( + llvm::MemoryBufferRef(toStringRef(m_data.GetData()), + m_file.GetFilename().GetStringRef()), + file_magic::xcoff_object_64); + if (!binary) { + LLDB_LOG_ERROR(log, binary.takeError(), + "Failed to create binary for file ({1}): {0}", m_file); + return false; + } + // Make sure we only handle XCOFF format. + m_binary = + llvm::unique_dyn_cast(std::move(*binary)); + if (!m_binary) + return false; + + LLDB_LOG(log, "this = {0}, module = {1} ({2}), file = {3}, binary = {4}", + this, GetModule().get(), GetModule()->GetSpecificationDescription(), + m_file.GetPath(), m_binary.get()); + + return true; +} + ObjectFile *ObjectFileXCOFF::CreateMemoryInstance( const lldb::ModuleSP &module_sp, WritableDataBufferSP data_sp, const lldb::ProcessSP &process_sp, lldb::addr_t header_addr) { @@ -108,10 +143,9 @@ size_t ObjectFileXCOFF::GetModuleSpecifications( static uint32_t XCOFFHeaderSizeFromMagic(uint32_t magic) { switch (magic) { - // TODO: 32bit not supported yet + // TODO: 32bit not supported. // case XCOFF::XCOFF32: // return sizeof(struct llvm::object::XCOFFFileHeader32); - case XCOFF::XCOFF64: return sizeof(struct llvm::object::XCOFFFileHeader64); break; @@ -127,19 +161,30 @@ bool ObjectFileXCOFF::MagicBytesMatch(DataBufferSP &data_sp, lldb::addr_t data_length) { lldb_private::DataExtractor data; data.SetData(data_sp, data_offset, data_length); + // Need to set this as XCOFF is only compatible with Big Endian data.SetByteOrder(eByteOrderBig); lldb::offset_t offset = 0; uint16_t magic = data.GetU16(&offset); return XCOFFHeaderSizeFromMagic(magic) != 0; } -bool ObjectFileXCOFF::ParseHeader() { return false; } +bool ObjectFileXCOFF::ParseHeader() { + // Only 64-bit is supported for now + return m_binary->fileHeader64()->Magic == XCOFF::XCOFF64; +} ByteOrder ObjectFileXCOFF::GetByteOrder() const { return eByteOrderBig; } bool ObjectFileXCOFF::IsExecutable() const { return true; } -uint32_t ObjectFileXCOFF::GetAddressByteSize() const { return 8; } +uint32_t ObjectFileXCOFF::GetAddressByteSize() const { + // 32-bit not supported. 
+  return 8;
+}
+
+AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) {
+  return AddressClass::eUnknown;
+}
 
 void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) {}
 
@@ -159,7 +204,13 @@ UUID ObjectFileXCOFF::GetUUID() { return UUID(); }
 
 uint32_t ObjectFileXCOFF::GetDependentModules(FileSpecList &files) { return 0; }
 
-ObjectFile::Type ObjectFileXCOFF::CalculateType() { return eTypeExecutable; }
+ObjectFile::Type ObjectFileXCOFF::CalculateType() {
+  if (m_binary->fileHeader64()->Flags & XCOFF::F_EXEC)
+    return eTypeExecutable;
+  else if (m_binary->fileHeader64()->Flags & XCOFF::F_SHROBJ)
+    return eTypeSharedLibrary;
+  return eTypeUnknown;
+}
 
 ObjectFile::Strata ObjectFileXCOFF::CalculateStrata() { return eStrataUnknown; }
 
diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h
index c0ce885f704241..2d4f9f3f2dab80 100644
--- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h
+++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.h
@@ -68,6 +68,8 @@ class ObjectFileXCOFF : public lldb_private::ObjectFile {
 
   uint32_t GetAddressByteSize() const override;
 
+  lldb_private::AddressClass GetAddressClass(lldb::addr_t file_addr) override;
+
   void ParseSymtab(lldb_private::Symtab &symtab) override;
 
   bool IsStripped() override;
@@ -99,6 +101,11 @@ class ObjectFileXCOFF : public lldb_private::ObjectFile {
   static lldb::WritableDataBufferSP
   MapFileDataWritable(const lldb_private::FileSpec &file, uint64_t Size,
                       uint64_t Offset);
+
+private:
+  bool CreateBinary();
+
+  std::unique_ptr<llvm::object::XCOFFObjectFile> m_binary;
 };
 
 #endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_XCOFF_OBJECTFILE_H
diff --git a/lldb/test/Shell/ObjectFile/XCOFF/basic-info.yaml b/lldb/test/Shell/ObjectFile/XCOFF/basic-info.yaml
index 761d66a6045d93..3c0037db36dbbc 100644
--- a/lldb/test/Shell/ObjectFile/XCOFF/basic-info.yaml
+++ b/lldb/test/Shell/ObjectFile/XCOFF/basic-info.yaml
@@ -13,7 +13,7 @@ FileHeader:
   MagicNumber:     0x1F7
   NumberOfSections: 1
   CreationTime:    000000000
-  Flags:           0x0000
+  Flags:           0x0002
 Sections:
   - Name:            .text
     Address:         0x100000438

From 0446990cc7af4e2b794660a98214edb401d6c50a Mon Sep 17 00:00:00 2001
From: Aaditya <115080342+easyonaadit@users.noreply.github.com>
Date: Wed, 18 Dec 2024 18:20:45 +0530
Subject: [PATCH 28/37] Reapply "[NFC][AMDGPU] Pre-commit clang and llvm tests
 for dynamic allocas" (#120410)

This reapplies commit https://github.com/llvm/llvm-project/pull/120063.
A machine-verifier bug was causing a crash in the previous commit.
This has been addressed in https://github.com/llvm/llvm-project/pull/120393.
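
As a quick illustration (a minimal sketch, not one of the pre-committed
tests; the function name is made up), a "dynamic" alloca is simply an
alloca whose element count is only known at run time, which AMDGPU code
generation currently rejects with "unsupported dynamic alloca":

; Sketch only: %n is a runtime value, so the stack allocation size
; cannot be folded to a compile-time constant.
define void @sketch_dynamic_stackalloc(i32 %n) {
  %buf = alloca i32, i32 %n, align 4, addrspace(5)
  store volatile i32 0, ptr addrspace(5) %buf
  ret void
}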
--- .../GlobalISel/dynamic-alloca-divergent.ll | 44 +++++ .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 184 +++++++++++++++++- 2 files changed, 226 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll index cfe5d1c194f420..aefcad491073fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll @@ -13,6 +13,31 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(ptr addrspace(1 ret void } +; ERR: remark: :0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_default_align) +; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_default_align +; ERR-NEXT: error: :0:0: in function kernel_dynamic_stackalloc_vgpr_default_align void (ptr addrspace(1)): unsupported dynamic alloca + +define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_default_align(ptr addrspace(1) %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id + %n = load i32, ptr addrspace(1) %gep + %alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} +; ERR: remark: :0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: kernel_dynamic_stackalloc_vgpr_align64) +; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align64 +; ERR-NEXT: error: :0:0: in function kernel_dynamic_stackalloc_vgpr_align64 void (ptr addrspace(1)): unsupported dynamic alloca + +define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align64(ptr addrspace(1) %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 %id + %n = load i32, ptr addrspace(1) %gep + %alloca = alloca i32, i32 %n, align 64, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + ; ERR: remark: :0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4) ; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4 ; ERR-NEXT: error: :0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca @@ -23,6 +48,25 @@ define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) { ret void } +; ERR: remark: :0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_default_align) +; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_default_align +; ERR-NEXT: error: :0:0: in function func_dynamic_stackalloc_vgpr_default_align void (i32): unsupported dynamic alloca + +define void @func_dynamic_stackalloc_vgpr_default_align(i32 %n) { + %alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 456, ptr addrspace(5) %alloca + ret void +} +; ERR: remark: :0:0: cannot select: %{{[0-9]+}}:sreg_32(p5) = G_DYN_STACKALLOC %{{[0-9]+}}:vgpr(s32), 64 (in function: func_dynamic_stackalloc_vgpr_align64) +; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align64 +; ERR-NEXT: error: :0:0: in function func_dynamic_stackalloc_vgpr_align64 void (i32): unsupported dynamic alloca + +define 
void @func_dynamic_stackalloc_vgpr_align64(i32 %n) { + %alloca = alloca i32, i32 %n, align 64, addrspace(5) + store volatile i32 456, ptr addrspace(5) %alloca + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index 1c093bf31ea75f..73aa87e5c55d20 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -5,8 +5,188 @@ target datalayout = "A5" ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -define amdgpu_kernel void @test_dynamic_stackalloc(ptr addrspace(1) %out, i32 %n) { +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { %alloca = alloca i32, i32 %n, addrspace(5) - store volatile i32 0, ptr addrspace(5) %alloca + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_over_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 128, addrspace(5) + store volatile i32 10, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_under_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 2, addrspace(5) + store volatile i32 22, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca float, i32 %idx, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, align 128, addrspace(5) + store volatile i32 444, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i128, i32 %idx, align 2, addrspace(5) + store volatile i32 666, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + %alloca1 = alloca i32, i32 8, addrspace(5) + %alloca2 = alloca i17, i32 %n, addrspace(5) + br i1 %cond, label %bb.0, label %bb.1 +bb.0: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca3 = alloca i32, i32 %m, align 64, addrspace(5) + %alloca4 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 3, ptr addrspace(5) %alloca3 + store volatile i32 4, ptr addrspace(5) %alloca4 + br label %bb.1 +bb.1: + store volatile i32 1, ptr addrspace(5) %alloca1 + store volatile i32 2, ptr addrspace(5) %alloca2 + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; 
CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + br i1 %cond, label %bb.0, label %bb.1 +bb.0: + %alloca2 = alloca i32, i32 %m, align 64, addrspace(5) + store volatile i32 2, ptr addrspace(5) %alloca2 + br label %bb.2 +bb.1: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca1 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 1, ptr addrspace(5) %alloca1 + br label %bb.2 +bb.2: + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_uniform(i32 %n) { + %alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 128, addrspace(5) + store volatile i32 10, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { + %alloca = alloca i32, i32 %n, align 2, addrspace(5) + store volatile i32 22, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_divergent() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, addrspace(5) + store volatile i32 123, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_divergent_over_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, align 128, addrspace(5) + store volatile i32 444, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_divergent_under_aligned() { + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca = alloca i32, i32 %idx, align 2, addrspace(5) + store volatile i32 666, ptr addrspace(5) %alloca + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + %alloca1 = alloca i32, i32 8, addrspace(5) + %alloca2 = alloca i32, i32 %n, addrspace(5) + br i1 %cond, label %bb.0, label %bb.1 +bb.0: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca3 = alloca i32, i32 %m, align 64, addrspace(5) + %alloca4 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 3, ptr addrspace(5) %alloca3 + store volatile i32 4, ptr addrspace(5) %alloca4 + br label %bb.1 +bb.1: + store volatile i32 1, ptr addrspace(5) %alloca1 + store volatile i32 2, ptr addrspace(5) %alloca2 + ret void +} + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { +entry: + %cond = icmp eq i32 %n, 0 + br i1 %cond, label %bb.0, label %bb.1 
+bb.0: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %alloca1 = alloca i32, i32 %idx, align 4, addrspace(5) + store volatile i32 1, ptr addrspace(5) %alloca1 + br label %bb.2 +bb.1: + %alloca2 = alloca i32, i32 %m, align 64, addrspace(5) + store volatile i32 2, ptr addrspace(5) %alloca2 + br label %bb.2 +bb.2: ret void } From 6da676ad35863ecea004ffa4059297a5c86dc6b2 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 18 Dec 2024 12:44:20 +0000 Subject: [PATCH 29/37] [AMDGPU] Use -triple instead of -arch in MC tests --- llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s | 4 ++-- llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s | 2 +- llvm/test/MC/AMDGPU/gfx950_asm_vop3.s | 8 ++++---- llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt | 2 +- .../MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt | 2 +- llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt | 2 +- llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 2 +- llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_xdlops.txt | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s index 93d015f790c862..a6907caafcbb61 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940-ERR --implicit-check-not=error: %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940-ERR --implicit-check-not=error: %s ds_read_b64_tr_b4 v[0:1], v1 // GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s index 301750689bc782..bad61e1e30103f 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop1_dpp16.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefixes=GFX950 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefixes=GFX950 %s v_prng_b32 v5, v1 quad_perm:[3,2,1,0] // GFX950: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb0,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s index 5f5e5057117059..c271d12579f343 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s @@ -1,7 +1,7 @@ -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx906 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX906-ERR %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx940 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX940-ERR %s -// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding < %s | FileCheck --check-prefix=GFX950 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx906 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX906-ERR %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX940-ERR %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding < %s | FileCheck --check-prefix=GFX950 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | 
FileCheck -check-prefix=GFX12-ERR %s v_cvt_pk_bf16_f32 v5, v1, v2 // GFX906-ERR: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt index 0697ee8661e76d..b0f3a8af8f3fbd 100644 --- a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt +++ b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx950 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX950 %s +# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx950 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX950 %s # GFX950: warning: invalid instruction encoding 0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt index 1efd2d7b996d48..10310f7ad1f3de 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -check-prefix=GFX950 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -check-prefix=GFX950 %s # GFX950: ds_read_b64_tr_b4 v[0:1], v0 ; encoding: [0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00] 0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt index 336a26907891a2..ac225355be6b4f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop1.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s # GFX950: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0xb1,0x0a,0x7e] 0x01,0xb1,0x0a,0x7e diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt index 7cd97ac87057e7..97bc68b0774b1c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s # GFX950: v_cvt_pk_bf16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x68,0xd2,0x01,0x05,0x02,0x00] 0x05,0x00,0x68,0xd2,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_xdlops.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_xdlops.txt index 53b0bcb0aa1ae7..059c8da66a49ab 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_xdlops.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_xdlops.txt @@ -1,4 +1,4 @@ -# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX950 %s # GFX950: v_dot2c_f32_bf16_e32 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x2c] 0x01,0x05,0x0a,0x2c From e7303fe80a0bea124422219356c1c9e845110a77 Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Wed, 18 Dec 2024 12:58:21 +0000 Subject: [PATCH 30/37] [Python] Use raw string 
literals for regexes (#120401) Previously these backslashes were not followed by a valid escape sequence character so were treated as literal backslashes, which was the intended behaviour of the code. However python as of 3.12 has started warning about these, so we should use raw string literals for regexes so that backslashes are always interpreted literally. I've done this for every regex in this file for consistency, including the ones which do not contain backslashes. --- llvm/utils/extract_symbols.py | 38 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/utils/extract_symbols.py b/llvm/utils/extract_symbols.py index 684e124c762594..388723421d6602 100755 --- a/llvm/utils/extract_symbols.py +++ b/llvm/utils/extract_symbols.py @@ -53,12 +53,12 @@ def nm_get_symbols(tool, lib): # The -P flag displays the size field for symbols only when applicable, # so the last field is optional. There's no space after the value field, # but \s+ match newline also, so \s+\S* will match the optional size field. - match = re.match("^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line) + match = re.match(r"^(\S+)\s+[BDGRSTuVW]\s+\S+\s+\S*$", line) if match: yield (match.group(1), True) # Look for undefined symbols, which have type U and may or may not # (depending on which nm is being used) have value and size. - match = re.match("^(\S+)\s+U\s+(\S+\s+\S*)?$", line) + match = re.match(r"^(\S+)\s+U\s+(\S+\s+\S*)?$", line) if match: yield (match.group(1), False) process.wait() @@ -71,7 +71,7 @@ def readobj_is_32bit_windows(tool, lib): [tool, "--file-header", lib], universal_newlines=True ) for line in output.splitlines(): - match = re.match("Format: (\S+)", line) + match = re.match(r"Format: (\S+)", line) if match: return match.group(1) == "COFF-i386" return False @@ -85,7 +85,7 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration): if not "?" in symbol: if calling_convention_decoration: # Remove calling convention decoration from names - match = re.match("[_@]([^@]+)", symbol) + match = re.match(r"[_@]([^@]+)", symbol) if match: symbol = match.group(1) # Discard floating point/SIMD constants. @@ -100,10 +100,10 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration): # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol # that mentions an anonymous namespace can be discarded, as the anonymous # namespace doesn't exist outside of that translation unit. - elif re.search("\?A(0x\w+)?@", symbol): + elif re.search(r"\?A(0x\w+)?@", symbol): return None # Skip X86GenMnemonicTables functions, they are not exposed from llvm/include/. - elif re.match("\?is[A-Z0-9]*@X86@llvm", symbol): + elif re.match(r"\?is[A-Z0-9]*@X86@llvm", symbol): return None # Keep mangled llvm:: and clang:: function symbols. 
How we detect these is a # bit of a mess and imprecise, but that avoids having to completely demangle @@ -123,7 +123,7 @@ def should_keep_microsoft_symbol(symbol, calling_convention_decoration): # ::= .+@ (list of types) # ::= .*Z (list of types, varargs) # ::= exceptions are not allowed - elif re.search("(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol): + elif re.search(r"(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$", symbol): return symbol return None @@ -140,7 +140,7 @@ def should_keep_itanium_symbol(symbol, calling_convention_decoration): if not symbol.startswith("_") and not symbol.startswith("."): return symbol # Discard manglings that aren't nested names - match = re.match("\.?_Z(T[VTIS])?(N.+)", symbol) + match = re.match(r"\.?_Z(T[VTIS])?(N.+)", symbol) if not match: return None # Demangle the name. If the name is too complex then we don't need to keep @@ -169,19 +169,19 @@ class TooComplexName(Exception): # (name, rest of string) pair. def parse_itanium_name(arg): # Check for a normal name - match = re.match("(\d+)(.+)", arg) + match = re.match(r"(\d+)(.+)", arg) if match: n = int(match.group(1)) name = match.group(1) + match.group(2)[:n] rest = match.group(2)[n:] return name, rest # Check for constructor/destructor names - match = re.match("([CD][123])(.+)", arg) + match = re.match(r"([CD][123])(.+)", arg) if match: return match.group(1), match.group(2) # Assume that a sequence of characters that doesn't end a nesting is an # operator (this is very imprecise, but appears to be good enough) - match = re.match("([^E]+)(.+)", arg) + match = re.match(r"([^E]+)(.+)", arg) if match: return match.group(1), match.group(2) # Anything else: we can't handle it @@ -196,13 +196,13 @@ def skip_itanium_template(arg): tmp = arg[1:] while tmp: # Check for names - match = re.match("(\d+)(.+)", tmp) + match = re.match(r"(\d+)(.+)", tmp) if match: n = int(match.group(1)) tmp = match.group(2)[n:] continue # Check for substitutions - match = re.match("S[A-Z0-9]*_(.+)", tmp) + match = re.match(r"S[A-Z0-9]*_(.+)", tmp) if match: tmp = match.group(1) # Start of a template @@ -231,14 +231,14 @@ def parse_itanium_nested_name(arg): ret = [] # Skip past the N, and possibly a substitution - match = re.match("NS[A-Z0-9]*_(.+)", arg) + match = re.match(r"NS[A-Z0-9]*_(.+)", arg) if match: tmp = match.group(1) else: tmp = arg[1:] # Skip past CV-qualifiers and ref qualifiers - match = re.match("[rVKRO]*(.+)", tmp) + match = re.match(r"[rVKRO]*(.+)", tmp) if match: tmp = match.group(1) @@ -280,19 +280,19 @@ def parse_microsoft_mangling(arg): if arg.startswith("@"): return components # Check for a simple name - match = re.match("(\w+)@(.+)", arg) + match = re.match(r"(\w+)@(.+)", arg) if match: components.append((match.group(1), False)) arg = match.group(2) continue # Check for a special function name - match = re.match("(\?_?\w)(.+)", arg) + match = re.match(r"(\?_?\w)(.+)", arg) if match: components.append((match.group(1), False)) arg = match.group(2) continue # Check for a template name - match = re.match("\?\$(\w+)@[^@]+@(.+)", arg) + match = re.match(r"\?\$(\w+)@[^@]+@(.+)", arg) if match: components.append((match.group(1), True)) arg = match.group(2) @@ -323,7 +323,7 @@ def get_template_name(sym, mangling): if mangling == "microsoft": names = parse_microsoft_mangling(sym) else: - match = re.match("\.?_Z(T[VTIS])?(N.+)", sym) + match = re.match(r"\.?_Z(T[VTIS])?(N.+)", sym) if match: names, _ = parse_itanium_nested_name(match.group(2)) else: From 4b56345895729fda3bc3c094bc3f237ba3a49686 Mon Sep 
17 00:00:00 2001
From: Kunwar Grover
Date: Wed, 18 Dec 2024 13:24:47 +0000
Subject: [PATCH 31/37] [mlir][SCF] Unify tileUsingFor and
 tileReductionUsingFor implementation (#120115)

This patch unifies the tiling implementation for tileUsingFor and
tileReductionUsingFor. This is done by passing an additional option to
SCFTilingOptions, allowing it to set how reduction dimensions should be
tiled.

Currently, there are 3 different options for reduction tiling:
FullReduction (old tileUsingFor), PartialReductionOuterReduction (old
tileReductionUsingFor) and PartialReductionOuterParallel
(linalg::tileReductionUsingForall; this isn't implemented in this patch).

The patch makes tileReductionUsingFor use the tileUsingFor
implementation with the new reduction tiling options.

There are no test changes because the implementation was doing almost
exactly the same thing. This was also tested in IREE (which uses both
these APIs heavily) and there were no test changes.
---
 .../SCF/Transforms/TileUsingInterface.h       |  57 ++-
 .../TransformOps/LinalgTransformOps.cpp       |  13 +-
 .../SCF/Transforms/TileUsingInterface.cpp     | 457 ++++++++++--------
 .../TestTilingInterfaceTransformOps.cpp       |   3 +-
 4 files changed, 305 insertions(+), 225 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 9f5f9f3fca97ad..d2cddfe00ac78e 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -85,6 +85,36 @@ struct SCFTilingOptions {
     return *this;
   }
 
+  /// Specify how reduction dimensions should be tiled.
+  ///
+  /// Tiling can be thought of as splitting a dimension into 2 and materializing
+  /// the outer dimension as a loop:
+  ///
+  /// op[original] -> op[original / x, x] -> loop[original] { op[x] }
+  ///
+  /// For parallel dimensions, the split can only happen in one way, with both
+  /// dimensions being parallel. For reduction dimensions however, there is a
+  /// choice in how we split the reduction dimension. This enum exposes this
+  /// choice.
+  enum class ReductionTilingStrategy {
+    // [reduction] -> [reduction1, reduction2]
+    // -> loop[reduction1] { [reduction2] }
+    FullReduction,
+    // [reduction] -> [reduction1, parallel2]
+    // -> loop[reduction1] { [parallel2] }; merge[reduction1]
+    PartialReductionOuterReduction,
+    // [reduction] -> [parallel1, reduction2]
+    // -> loop[parallel1] { [reduction2] }; merge[parallel1]
+    PartialReductionOuterParallel
+  };
+  ReductionTilingStrategy reductionStrategy =
+      ReductionTilingStrategy::FullReduction;
+  SCFTilingOptions &
+  setReductionTilingStrategy(ReductionTilingStrategy strategy) {
+    reductionStrategy = strategy;
+    return *this;
+  }
+
   /// Specify mapping of loops to devices. This is only respected when the loop
   /// constructs support such a mapping (like `scf.forall`). Will be ignored
   /// when using loop constructs that dont support such a mapping (like
@@ -102,11 +132,16 @@ struct SCFTilingResult {
   /// matter except the last op. The replacements are expected to be the results
   /// of the last op.
   SmallVector<Operation *> tiledOps;
+  /// The initial destination values passed to the tiled operations.
+  SmallVector<Value> initialValues;
   /// The `scf.for` operations that iterate over the tiles.
   SmallVector<LoopLikeOpInterface> loops;
-  /// Values to use as replacements for the untiled op. Is the same size as the
-  /// number of results of the untiled op.
- SmallVector replacements; + /// The result generated by the loop nest in tiling, may hold partial results, + /// which need to be merged to match the computation of the untiled operation. + /// `mergeResult` contains the operations used to perform this merge from + /// partial results and the values that can be used as replacements of + /// the untiled operation. + MergeResult mergeResult; /// Slices generated after tiling that can be used for fusing with the tiled /// producer. SmallVector generatedSlices; @@ -300,20 +335,6 @@ tileAndFuseConsumerOfSlice(RewriterBase &rewriter, Operation *candidateSliceOp); FailureOr> lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op); -/// Transformation information returned after reduction tiling. -struct SCFReductionTilingResult { - /// The partial reduction tiled op generated. - SmallVector parallelTiledOps; - /// The final reduction operation merging all the partial reductions. - SmallVector mergeOps; - /// Initial values used for reduction. - SmallVector initialValues; - /// The loop operations that iterate over the tiles. - SmallVector loops; - /// The replacements to use for the results of the tiled operation. - SmallVector replacements; -}; - /// Method to tile a reduction and generate a parallel op within a serial loop. /// Each of the partial reductions are calculated in parallel. Then after the /// loop all the partial reduction are merged into a final reduction. @@ -338,7 +359,7 @@ struct SCFReductionTilingResult { /// %6 = linalg.generic %1 ["parallel", "reduction"] /// : tensor<7x4xf32> -> tensor<7xf32> /// ``` -FailureOr +FailureOr tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, ArrayRef tileSize); diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 8397652d1d8a8a..18fd24da395b76 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2223,7 +2223,7 @@ transform::ScalarizeOp::applyToOne(transform::TransformRewriter &rewriter, return emitDefaultDefiniteFailure(target); if (target->getNumResults()) - rewriter.replaceOp(target, maybeTilingResult->replacements); + rewriter.replaceOp(target, maybeTilingResult->mergeResult.replacements); else rewriter.eraseOp(target); @@ -2630,17 +2630,18 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForOp::applyToOne( transform::ApplyToEachResultList &results, transform::TransformState &state) { rewriter.setInsertionPoint(target); - FailureOr result = scf::tileReductionUsingScf( + FailureOr result = scf::tileReductionUsingScf( rewriter, cast(target.getOperation()), getAsOpFoldResult(rewriter.getI64ArrayAttr(getTileSizes()))); if (failed(result)) return emitDefaultSilenceableFailure(target); + rewriter.replaceOp(target, result->mergeResult.replacements); for (Value initValue : result->initialValues) results.push_back(initValue.getDefiningOp()); - for (auto parallelTiledOp : result->parallelTiledOps) + for (auto parallelTiledOp : result->tiledOps) results.push_back(parallelTiledOp); - for (auto mergeOp : result->mergeOps) + for (auto mergeOp : result->mergeResult.mergeOps) results.push_back(mergeOp); results.push_back(result->loops.front()); return DiagnosedSilenceableFailure::success(); @@ -3064,7 +3065,7 @@ transform::TileUsingForOp::apply(transform::TransformRewriter &rewriter, if (failed(maybeTilingResult)) return DiagnosedSilenceableFailure::definiteFailure(); - 
rewriter.replaceOp(op, maybeTilingResult->replacements); + rewriter.replaceOp(op, maybeTilingResult->mergeResult.replacements); tiled.append(maybeTilingResult->tiledOps); for (const auto &en2 : llvm::enumerate(maybeTilingResult->loops)) @@ -3303,7 +3304,7 @@ DiagnosedSilenceableFailure transform::tileToForallOpImpl( if (failed(maybeTilingResult)) return transformOp.emitDefaultSilenceableFailure(tileableOp); - rewriter.replaceOp(tileableOp, maybeTilingResult->replacements); + rewriter.replaceOp(tileableOp, maybeTilingResult->mergeResult.replacements); tilingResult = *maybeTilingResult; diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 6a4a6b43933806..ef5d4370e78102 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -570,6 +570,144 @@ static LogicalResult generateLoopNest( return rewriter.notifyMatchFailure(loc, "unhandled loop type"); } +static FailureOr> +createInitialTensorsForTiling(RewriterBase &rewriter, TilingInterface op, + ArrayRef tileSizes, + const scf::SCFTilingOptions &options) { + SmallVector initTensors; + Location loc = op->getLoc(); + switch (options.reductionStrategy) { + case scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: + if (failed(tensor::getOrCreateDestinations(rewriter, loc, op, initTensors))) + return failure(); + return initTensors; + case scf::SCFTilingOptions::ReductionTilingStrategy:: + PartialReductionOuterReduction: { + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only supported" + "for operations implementing PartialReductionOpInterface"); + } + // Get reduction dimensions. + // TODO: PartialReductionOpInterface should really query TilingInterface + // itself and find reduction dimensions. + SmallVector reductionDims; + for (auto [idx, iteratorType] : + llvm::enumerate(op.getLoopIteratorTypes())) { + if (iteratorType == utils::IteratorType::reduction) + reductionDims.push_back(idx); + } + return redOp.generateInitialTensorForPartialReduction( + rewriter, loc, tileSizes, reductionDims); + } + default: + return rewriter.notifyMatchFailure(op, + "unhandled reduction tiling strategy"); + } +} + +static FailureOr +getTiledImplementation(RewriterBase &rewriter, TilingInterface op, + ValueRange regionIterArg, ArrayRef offsets, + ArrayRef sizes, + const scf::SCFTilingOptions &options) { + switch (options.reductionStrategy) { + case scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: + return op.getTiledImplementation(rewriter, offsets, sizes); + case scf::SCFTilingOptions::ReductionTilingStrategy:: + PartialReductionOuterReduction: { + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only " + "supported for operations " + "implementing PartialReductionOpInterface"); + } + // Get reduction dimensions. + // TODO: PartialReductionOpInterface should really query TilingInterface + // itself and find reduction dimensions. 
+ SmallVector reductionDims; + for (auto [idx, iteratorType] : + llvm::enumerate(op.getLoopIteratorTypes())) { + if (iteratorType == utils::IteratorType::reduction) + reductionDims.push_back(idx); + } + return redOp.tileToPartialReduction(rewriter, op.getLoc(), regionIterArg, + offsets, sizes, reductionDims); + } + default: + return rewriter.notifyMatchFailure(op, + "unhandled reduction tiling strategy"); + } +} + +static LogicalResult +getResultTilePosition(RewriterBase &rewriter, int64_t index, Value tiledResult, + TilingInterface op, ArrayRef offsets, + ArrayRef sizes, + SmallVector &resultOffset, + SmallVector &resultSize, + const scf::SCFTilingOptions &options) { + + switch (options.reductionStrategy) { + case scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: + return op.getResultTilePosition(rewriter, index, offsets, sizes, + resultOffset, resultSize); + case scf::SCFTilingOptions::ReductionTilingStrategy:: + PartialReductionOuterReduction: { + // TODO: This does not work for non identity accesses to the result tile. + // The proper fix is to add a getPartialResultTilePosition method to + // PartialReductionOpInterface. + resultOffset = + SmallVector(offsets.size(), rewriter.getIndexAttr(0)); + for (size_t i = 0; i < offsets.size(); i++) { + resultSize.push_back( + tensor::getMixedSize(rewriter, op.getLoc(), tiledResult, i)); + } + return success(); + default: + return rewriter.notifyMatchFailure(op, + "unhandled reduction tiling strategy"); + } + } +} + +static FailureOr +mergeTilingResults(RewriterBase &rewriter, TilingInterface op, + ValueRange partialResults, + const scf::SCFTilingOptions &options) { + switch (options.reductionStrategy) { + case scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction: + // No need to merge results for reduction tiling strategy. + return MergeResult{{}, partialResults}; + case scf::SCFTilingOptions::ReductionTilingStrategy:: + PartialReductionOuterReduction: { + auto redOp = dyn_cast(op.getOperation()); + if (!redOp) { + return rewriter.notifyMatchFailure( + op, "PartialReductionOuterReduction tiling strategy is only " + "supported for operations " + "implementing PartialReductionOpInterface"); + } + // Get reduction dimensions. + // TODO: PartialReductionOpInterface should really query TilingInterface + // itself and find reduction dimensions. + SmallVector reductionDims; + for (auto [idx, iteratorType] : + llvm::enumerate(op.getLoopIteratorTypes())) { + if (iteratorType == utils::IteratorType::reduction) + reductionDims.push_back(idx); + } + return redOp.mergeReductions(rewriter, op.getLoc(), partialResults, + reductionDims); + } + default: + return rewriter.notifyMatchFailure(op, + "unhandled reduction tiling strategy"); + } +} + /// Append the specified additional `newInitOperands` operands to the /// loops existing `init` operands (or similar), and replace `loopOp` with /// the new loop that has the additional init operands. The loop body of @@ -710,11 +848,11 @@ FailureOr yieldTiledValuesAndReplaceLoop( }); } -/// Method to add new init values to a loop nest. Updates `loops` in-place with -/// new loops that use the `newInitValues`. -/// The outer-loops are updated to yield the new result values of the inner -/// loop. For the innermost loop, the call back `getNewYields` is invoked to get -/// the additional values to yield form the innermost loop. +/// Method to add new init values to a loop nest. Updates `loops` in-place +/// with new loops that use the `newInitValues`. 
The outer-loops are updated +/// to yield the new result values of the inner loop. For the innermost loop, +/// the call back `getNewYields` is invoked to get the additional values to +/// yield form the innermost loop. static LogicalResult addInitOperandsToLoopNest( RewriterBase &rewriter, MutableArrayRef loops, ValueRange newInitValues, YieldTiledValuesFn getNewTiledYieldsFn) { @@ -852,9 +990,9 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, auto clonedOp = cast( cloneOpAndUpdateDestinationArgs(rewriter, op, regionIterArgs)); - // 5b. Early return cloned op if tiling is not happening. We can not return - // the original op because it could lead to - // `rewriter.replaceOp(op, op->getResults())` and users would get crash. + // 5b. Early return cloned op if tiling is not happening. We can not + // return the original op because it could lead to `rewriter.replaceOp(op, + // op->getResults())` and users would get crash. if (llvm::all_of(tileSizes, isZeroIndex)) { tiledResults.append(clonedOp->result_begin(), clonedOp->result_end()); tilingResult = @@ -864,7 +1002,8 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, } // 5c. Tile the cloned operation. - tilingResult = clonedOp.getTiledImplementation(rewriter, offsets, sizes); + tilingResult = getTiledImplementation(rewriter, clonedOp, regionIterArgs, + offsets, sizes, options); if (failed(tilingResult)) { rewriter.eraseOp(clonedOp); return op.emitOpError("faild to tile operation"); @@ -879,8 +1018,9 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, llvm::enumerate(tilingResult->tiledValues)) { tiledResults.push_back(tiledValue); SmallVector resultOffset, resultSize; - if (failed(op.getResultTilePosition(rewriter, index, offsets, sizes, - resultOffset, resultSize))) { + if (failed(getResultTilePosition(rewriter, index, tiledValue, op, offsets, + sizes, resultOffset, resultSize, + options))) { for (auto op : tilingResult->tiledOps) { rewriter.eraseOp(op); } @@ -895,158 +1035,65 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, }; // 6. Find the destination tensors to use for the operation. - SmallVector destinationTensors; - if (failed(tensor::getOrCreateDestinations(rewriter, op.getLoc(), op, - destinationTensors))) { - return rewriter.notifyMatchFailure(op, - "unable to create destination tensors"); + FailureOr> maybeInits = + createInitialTensorsForTiling(rewriter, op, tileSizes, options); + if (failed(maybeInits)) { + return rewriter.notifyMatchFailure( + op, "unable to create initial tensors for tiling"); } + SmallVector &initTensors = maybeInits.value(); // 7. Generate the tiled loops nest using the callback defined above. SmallVector loops; if (failed(generateLoopNest(rewriter, op.getLoc(), options, iterationDomain, - tileSizes, numThreads, destinationTensors, + tileSizes, numThreads, initTensors, innerYieldTiledValuesFn, loops))) return op.emitOpError("failed to generate tiling loops"); assert(succeeded(tilingResult) && "expected tiling result to be computed after loop generation"); - // If loops are empty, the tiled op is used as the replacement for the untiled - // op. + SmallVector partialResults; if (loops.empty()) { - return scf::SCFTilingResult{tilingResult->tiledOps, loops, - tilingResult->tiledValues, - tilingResult->generatedSlices}; + // If loops are empty, the tiled op is used as the replacement for the + // untiled op. 
+ partialResults = tilingResult->tiledValues; + } else { + partialResults = llvm::map_to_vector(loops.front()->getResults(), + [](OpResult r) -> Value { return r; }); } - SmallVector replacements = llvm::map_to_vector( - loops.front()->getResults(), [](OpResult r) -> Value { return r; }); - return scf::SCFTilingResult{tilingResult->tiledOps, loops, replacements, + FailureOr mergeResult = + mergeTilingResults(rewriter, op, partialResults, options); + if (failed(mergeResult)) { + return rewriter.notifyMatchFailure( + op, "Failed to merge partial results from tiling"); + } + + return scf::SCFTilingResult{tilingResult->tiledOps, initTensors, loops, + mergeResult.value(), tilingResult->generatedSlices}; } -FailureOr +FailureOr mlir::scf::tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, ArrayRef tileSizes) { - Location loc = op.getLoc(); - // Ops implementing PartialReductionOpInterface are expected to implement - // TilingInterface. - auto tilingInterfaceOp = cast(op.getOperation()); - SmallVector iterationDomain = tilingInterfaceOp.getIterationDomain(b); - auto tileSizesVector = llvm::to_vector(tileSizes); - if (tileSizesVector.size() < iterationDomain.size()) { - auto zero = b.getIndexAttr(0); - tileSizesVector.append(iterationDomain.size() - tileSizesVector.size(), - zero); - } - SmallVector iterators = - tilingInterfaceOp.getLoopIteratorTypes(); - - SmallVector reductionDims; - for (auto [idx, iteratorType] : - llvm::enumerate(tilingInterfaceOp.getLoopIteratorTypes())) { - if (iteratorType == utils::IteratorType::reduction) - reductionDims.push_back(idx); - } - - // 2. create the inital tensor value. - FailureOr> maybeInitTensors = - op.generateInitialTensorForPartialReduction(b, loc, tileSizesVector, - reductionDims); - if (failed(maybeInitTensors)) { - return b.notifyMatchFailure(op, "Failed to create initial tensors."); - } - SmallVector &initTensors = maybeInitTensors.value(); - - // 3. Define the callback to use for generating the inner most tile loop body. - SmallVector parallelTiledOps; - auto innerYieldTiledValuesFn = - [&](RewriterBase &rewriter, Location loc, ValueRange ivs, - ValueRange regionIterArgs, SmallVector &tiledResult, - SmallVector> &resultOffsets, - SmallVector> &resultSizes) - -> LogicalResult { - SmallVector offsets, sizes; - { - int materializedLoopNum = 0; - for (auto [tileSize, loopRange] : - llvm::zip_equal(tileSizesVector, iterationDomain)) { - if (isConstantIntValue(tileSize, 0)) { - offsets.push_back(loopRange.offset); - sizes.push_back(loopRange.size); - continue; - } - Value iv = ivs[materializedLoopNum++]; - offsets.push_back(iv); - sizes.push_back( - getBoundedTileSize(rewriter, loc, loopRange, iv, tileSize)); - } - } - - // 4a. Clone the operation. - { - auto clonedOp = cast( - cloneOpAndUpdateDestinationArgs(b, op, regionIterArgs)); - - // 4b. Tile the cloned operation. - FailureOr partialTilingResult = - clonedOp.tileToPartialReduction(b, loc, regionIterArgs, offsets, - sizes, reductionDims); - if (failed(partialTilingResult)) { - return failure(); - } - std::swap(parallelTiledOps, partialTilingResult->tiledOps); - std::swap(tiledResult, partialTilingResult->tiledValues); - - // 4c. Delete the cloned operation. - b.eraseOp(clonedOp); - } - - // 4d. Compute the offsets and sizes needed to insert the result of the - // tiled value back into destination before yielding the destination. 
- for (auto result : tiledResult) { - SmallVector outOffsets(offsets.size(), b.getIndexAttr(0)); - resultOffsets.emplace_back(std::move(outOffsets)); - - SmallVector outSizes; - for (size_t i = 0; i < offsets.size(); i++) { - outSizes.push_back(tensor::getMixedSize(b, loc, result, i)); - } - resultSizes.emplace_back(std::move(outSizes)); - } - return success(); - }; - - // 5. Generate the tiled implementation using the destination tensors. - SmallVector loops; - scf::SCFTilingOptions options; - options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp); - if (failed(generateLoopNest(b, loc, options, iterationDomain, tileSizesVector, - /*numThreads=*/ArrayRef{}, - initTensors, innerYieldTiledValuesFn, loops))) - return b.notifyMatchFailure(op, "failed to tile for parallel reduction"); - - SmallVector replacements = llvm::map_to_vector( - loops.front()->getResults(), [](OpResult r) -> Value { return r; }); - - // 5. Apply the merge reduction to combine all the partial values. - b.setInsertionPointAfter(*loops.begin()); - FailureOr mergeResult = - op.mergeReductions(b, loc, replacements, reductionDims); - if (failed(mergeResult)) { - return failure(); - } - b.replaceOp(op, mergeResult->replacements); - - SCFReductionTilingResult reductionTilingResult; - std::swap(reductionTilingResult.parallelTiledOps, parallelTiledOps); - std::swap(reductionTilingResult.mergeOps, mergeResult->mergeOps); - std::swap(reductionTilingResult.initialValues, initTensors); - std::swap(reductionTilingResult.loops, loops); - std::swap(reductionTilingResult.replacements, mergeResult->replacements); - - return reductionTilingResult; + SCFTilingOptions options; + options.setLoopType(SCFTilingOptions::LoopType::ForOp); + options.setReductionTilingStrategy(SCFTilingOptions::ReductionTilingStrategy:: + PartialReductionOuterReduction); + options.setTileSizes(tileSizes); + + TilingInterface tilingInterfaceOp = + dyn_cast(op.getOperation()); + if (!tilingInterfaceOp) { + return b.notifyMatchFailure( + op, + "Operation implementing PartialReductionOpInterface should implement " + "TilingInterface"); + } + + return tileUsingSCF(b, tilingInterfaceOp, options); } //===----------------------------------------------------------------------===// @@ -1055,9 +1102,10 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b, /// Return the untiled producer whose slice is used in a tiled consumer. The /// method traverses the tile loop nest (`loops`) if needed, and returns the -/// `iter_args` of the outer most that is encountered. Traversing the iter_args -/// indicates that this is a destination operand of the consumer. If there was -/// no loop traversal needed, the second value of the returned tuple is empty. +/// `iter_args` of the outer most that is encountered. Traversing the +/// iter_args indicates that this is a destination operand of the consumer. If +/// there was no loop traversal needed, the second value of the returned tuple +/// is empty. static std::tuple> getUntiledProducerFromSliceSource(OpOperand *source, ArrayRef loops) { @@ -1115,8 +1163,8 @@ mlir::scf::tileAndFuseProducerOfSlice( Operation *clonedProducerOp = cloneOpAndUpdateDestinationArgs( rewriter, fusableProducerOp, clonedOpDestinationTensors); // 2d. Update the source of the candidateSlice to be the cloned producer. 
- // Easier to just clone the slice with different source since replacements - // and DCE of cloned ops becomes easier + // Easier to just clone the slice with different source since + // replacements and DCE of cloned ops becomes easier SmallVector candidateSliceOpOperands = llvm::to_vector(candidateSliceOp->getOperands()); candidateSliceOpOperands[0] = clonedProducerOp->getResult(resultNumber); @@ -1250,13 +1298,13 @@ FailureOr> mlir::scf::yieldReplacementForFusedProducer( failed(tilableOp.getIterationDomainTileFromResultTile( rewriter, sliceResultNumber, sliceOffset, sliceSizes, iterDomainOffset, iterDomainSizes))) { - // In theory, it is unnecessary to raise an error here. Actually although - // it fails to reconstruct the result tensor, it should not broke current - // fusion anyway. The reason why we must return failure currently is that - // the callback function `newYieldValuesFn` will be called after new init - // operand(s) has already been appended. It will take more refactoring to - // make sure the init operands are added consistently in the future. For - // more details, please refer to: + // In theory, it is unnecessary to raise an error here. Actually + // although it fails to reconstruct the result tensor, it should not + // broke current fusion anyway. The reason why we must return failure + // currently is that the callback function `newYieldValuesFn` will be + // called after new init operand(s) has already been appended. It will + // take more refactoring to make sure the init operands are added + // consistently in the future. For more details, please refer to: // https://github.com/llvm/llvm-project/pull/93144#discussion_r1643760814 return failure(); } @@ -1282,7 +1330,8 @@ FailureOr> mlir::scf::yieldReplacementForFusedProducer( } } - // d. create `extract_slice` for `iter_args` for DPS operation if necessary + // d. create `extract_slice` for `iter_args` for DPS operation if + // necessary if (auto tiledDestStyleOp = dyn_cast(tiledOwner)) { rewriter.setInsertionPoint(tiledDestStyleOp); @@ -1334,9 +1383,10 @@ class SliceTrackingListener : public RewriterBase::Listener { std::optional patterns); SliceTrackingListener() = default; - /// Adds the given list of operations to the worklist, and if present, applies - /// the list of `patterns` to the newly added operations. This only processes - /// the given operations and any newly inserted ones by the pattern set. + /// Adds the given list of operations to the worklist, and if present, + /// applies the list of `patterns` to the newly added operations. This only + /// processes the given operations and any newly inserted ones by the + /// pattern set. LogicalResult insertAndApplyPatterns(ArrayRef newOps); /// Add to the new operation worklist if it is an extract_slice. @@ -1357,7 +1407,8 @@ class SliceTrackingListener : public RewriterBase::Listener { std::deque worklist; private: - /// Optional pattern set to apply when adding new operations to the worklist. + /// Optional pattern set to apply when adding new operations to the + /// worklist. std::optional patterns = std::nullopt; }; @@ -1390,8 +1441,9 @@ void SliceTrackingListener::notifyOperationInserted( worklist.push_back(slice); } -// Scan the worklist for the given op and remove it if present. The expectation -// is for the worklist to be small and for removal to be relatively rare. +// Scan the worklist for the given op and remove it if present. The +// expectation is for the worklist to be small and for removal to be +// relatively rare. 
void SliceTrackingListener::removeOp(Operation *op) { if (!isa(op)) return; @@ -1445,17 +1497,18 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF( auto &loops = tilingResult->loops; if (loops.empty()) { DenseMap replacements; - for (auto [origVal, replacement] : - llvm::zip_equal(consumer->getResults(), tilingResult->replacements)) { + for (auto [origVal, replacement] : llvm::zip_equal( + consumer->getResults(), tilingResult->mergeResult.replacements)) { replacements[origVal] = replacement; } return scf::SCFTileAndFuseResult{fusedProducers, tiledAndFusedOps, loops, replacements}; } - // To keep track of replacements for now just record the map from the original - // untiled value to the result number of the for loop. Since the loop gets - // potentially replaced during fusion, keeping the value directly wont work. + // To keep track of replacements for now just record the map from the + // original untiled value to the result number of the for loop. Since the + // loop gets potentially replaced during fusion, keeping the value directly + // wont work. DenseMap origValToResultNumber; for (auto [index, result] : llvm::enumerate(consumer->getResults())) { origValToResultNumber[result] = index; @@ -1463,11 +1516,11 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF( // 2. Typically, the operands of the tiled operation are slices of the // operands of the untiled operation. These are expressed in IR using - // `tensor.extract_slice` operations with source being the operands of the - // untiled operation. Create a worklist of these `tensor.extract_slice` - // operations. If the producers of the source of the `tensor.extract_slice` - // can be tiled such that the tiled value is generated in-place, that - // effectively tiles + fuses the operations. + // `tensor.extract_slice` operations with source being the operands of + // the untiled operation. Create a worklist of these + // `tensor.extract_slice` operations. If the producers of the source of + // the `tensor.extract_slice` can be tiled such that the tiled value is + // generated in-place, that effectively tiles + fuses the operations. struct WorklistItem { tensor::ExtractSliceOp candidateSlice; SCFTileAndFuseOptions::ControlFnResult controlFnResult; @@ -1511,9 +1564,10 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF( SmallVector worklistCandidates = fusedResult->generatedSlices; if (worklistItem.controlFnResult.yieldProducerReplacement) { - // Reconstruct and yield all opResult of fusableProducerOp by default. The - // caller can specific which one to yield by designating optional argument - // named `yieldResultNumber` of `yieldReplacementForFusedProducer`. + // Reconstruct and yield all opResult of fusableProducerOp by default. + // The caller can specific which one to yield by designating optional + // argument named `yieldResultNumber` of + // `yieldReplacementForFusedProducer`. Operation *fusableProducerOp = fusedResult->origProducer.getOwner(); FailureOr> newSlices = yieldReplacementForFusedProducer(rewriter, @@ -1582,8 +1636,8 @@ checkAssumptionForFusingConsumer(tensor::InsertSliceOp candidateSliceOp) { return success(); } -/// An utility to get the first user of the given loopOp. If any of user stay in -/// different block of loopOp, return failure. +/// An utility to get the first user of the given loopOp. If any of user stay +/// in different block of loopOp, return failure. 
static FailureOr getFirstUserOfLoop(Operation *loopOp) {
   if (!isa(loopOp))
     return failure();
@@ -1616,11 +1670,11 @@ static FailureOr getFirstUserOfLoop(Operation *loopOp) {
   return firstUserOfLoop;
 }

-/// This utility currently checks whether the first userOp of loop is NOT before
-/// the last defineOp of consumer operand. Because that we need to move the
-/// whole loop structure right before the `firstUserOfLoop`. This utility thus
-/// helps ensuring that no invalid IR is formed, i.e. no backward slice of
-/// consumerOp is dominated by the `firstUserOfLoop`. Saying that:
+/// This utility currently checks whether the first userOp of the loop is NOT
+/// before the last defineOp of the consumer operand, because we need to move
+/// the whole loop structure right before the `firstUserOfLoop`. This utility
+/// thus helps ensure that no invalid IR is formed, i.e. no backward slice
+/// of consumerOp is dominated by the `firstUserOfLoop`. That is:
 ///
 /// ```
 /// %0 = scf.for() {
@@ -1634,9 +1688,9 @@ static FailureOr getFirstUserOfLoop(Operation *loopOp) {
 /// %3 = consumerOp(%2)
 /// ```
 ///
-/// If the `firstUserOfLoop` is before `lastDefOfConsumerOperand`, then it would
-/// be invalid to move the `loopOp` right before the `firstUserOfLoop`, a.k.a.
-/// use-def chain violation:
+/// If the `firstUserOfLoop` is before `lastDefOfConsumerOperand`, then it
+/// would be invalid to move the `loopOp` right before the `firstUserOfLoop`,
+/// i.e. a use-def chain violation:
 ///
 /// ```
 /// %0:2 = scf.for() {
@@ -1650,10 +1704,10 @@ static FailureOr getFirstUserOfLoop(Operation *loopOp) {
 ///
 /// @param loopOp: loop operation
 /// @param consumerOp: consumer operation
-/// @param reorderOperations: the flag controls whether to reorder the backward
-/// slice w.r.t. the defineOp of `consumerOp` operands.
-/// @return: computed backward slice of consumerOp, but excluding those already
-/// dominates `firstUserOfLoop`.
+/// @param reorderOperations: the flag controls whether to reorder the
+/// backward slice w.r.t. the defineOp of `consumerOp` operands.
+/// @return: the computed backward slice of consumerOp, excluding ops that
+/// already dominate `firstUserOfLoop`.
 static FailureOr>
 checkAssumptionForLoop(Operation *loopOp, Operation *consumerOp,
                        bool reorderOperations) {
@@ -1713,8 +1767,8 @@ static FailureOr getConsumerFromLoopUses(RewriterBase &rewriter,
     if (!isa(consumerOp) ||
         !isa(consumerOp)) {
       // TODO: We have to init result of consumer before scf.for, use
-      // DestinationStyleOpInterface to get result shape from init for now. Add
-      // support for other op such as op has InferTypeOpInterface.
+      // DestinationStyleOpInterface to get the result shape from init for now.
+      // Add support for other ops, such as those with InferTypeOpInterface.
       continue;
     }
     // Step 2. Check if user stay in the same block.
@@ -1729,7 +1783,8 @@ static FailureOr getConsumerFromLoopUses(RewriterBase &rewriter,
         checkAssumptionForLoop(loopOp, consumerOp, true);
     if (failed(slice))
       continue;
-    // Step 5. If backward sice is not empty, move them before firstUserOfLoop.
+    // Step 5. If the backward slice is not empty, move it before
+    // firstUserOfLoop.
     if (!slice->empty()) {
       mlir::topologicalSort(*slice);
       FailureOr firstUserOfLoop = getFirstUserOfLoop(loopOp);
@@ -1743,8 +1798,8 @@ static FailureOr getConsumerFromLoopUses(RewriterBase &rewriter,
   return failure();
 }

-/// Find the perfectly nested loops outside of given loop(included) sorted from
-/// outer to inner.
+/// Find the perfectly nested loops outside of the given loop (inclusive),
+/// sorted from outer to inner.
 ///
 /// E.g.
 ///
@@ -1997,10 +2052,11 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter,
   }

   // 10. Try to get iter domain position from input position. Use
-  // clonedConsumerOp instead of tiledConsumerOp, because the iteration domain
-  // may require index computation based on the result size. The sizes and
-  // offsets should be the same either way, but using tiledConsumerOp could
-  // lead to some chained unnecessary extra index computation.
+  // clonedConsumerOp instead of tiledConsumerOp, because the iteration
+  // domain may require index computation based on the result size. The
+  // sizes and offsets should be the same either way, but using
+  // tiledConsumerOp could lead to some chained unnecessary extra index
+  // computation.
   SmallVector iterDomainOffsets, iterDomainSizes;
   if (failed(clonedConsumerOp.getIterationDomainTileFromOperandTile(
           rewriter, operandNumber, offsets, sizes, iterDomainOffsets,
@@ -2067,7 +2123,8 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter,
         "unable to add new inits to nest loop");
   }

-  // 15. Replace the result of scf loop and consumer op with new loop's results.
+  // 15. Replace the results of the scf loop and consumer op with the new
+  // loop's results.
   for (auto &&[oldResult, newResult] :
        llvm::zip(
            consumerOp->getResults(),
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
index 5e903e378daf82..7380b766935ffe 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
@@ -250,7 +250,8 @@ applyTileToAll(RewriterBase &rewriter, Operation *transformOp,
       return failure();

     // Perform the replacement of tiled and fused values.
-    rewriter.replaceOp(tilingInterfaceOp, tiledResults->replacements);
+    rewriter.replaceOp(tilingInterfaceOp,
+                       tiledResults->mergeResult.replacements);

     // Report back the relevant handles to the transform op.
     tiledOps.push_back(tiledResults->tiledOps.front());

From fbc18b85d6ce5ab6489a2b08f9b38d446fe9d6f6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Wed, 18 Dec 2024 13:32:53 +0000
Subject: [PATCH 32/37] Revert "[VectorCombine] Combine scalar fneg with
 insert/extract to vector fneg when length is different" (#120422)

Reverts llvm/llvm-project#115209 - investigating a reported regression

---
 .../Transforms/Vectorize/VectorCombine.cpp    |  34 +---
 .../VectorCombine/X86/extract-fneg-insert.ll  | 154 ------------------
 2 files changed, 8 insertions(+), 180 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 5254fab1cdc914..791006c48b5ddf 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -666,10 +666,9 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
                        m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
     return false;

+  // TODO: We could handle this with a length-changing shuffle.
   auto *VecTy = cast(I.getType());
-  auto *ScalarTy = VecTy->getScalarType();
-  auto *SrcVecTy = dyn_cast(SrcVec->getType());
-  if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
+  if (SrcVec->getType() != VecTy)
     return false;

   // Ignore bogus insert/extract index.
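After this revert, `foldInsExtFNeg` fires only when the source and destination vectors have the same type. A minimal sketch of the pattern and its folded form (function names are illustrative; the shuffle-mask entry `6` is `Index + NumElts` = `2 + 4`, selecting the negated lane, which matches the `Mask[Index]` computation below):

```llvm
; Scalar form: extract a lane, negate it, reinsert at the same index.
define <4 x float> @ins_ext_fneg(<4 x float> %x, <4 x float> %y) {
  %e = extractelement <4 x float> %x, i32 2
  %n = fneg float %e
  %r = insertelement <4 x float> %y, float %n, i32 2
  ret <4 x float> %r
}

; Folded form: one vector fneg plus a select-style shuffle.
define <4 x float> @ins_ext_fneg_folded(<4 x float> %x, <4 x float> %y) {
  %neg = fneg <4 x float> %x
  %r = shufflevector <4 x float> %y, <4 x float> %neg, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %r
}
```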
@@ -683,6 +682,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { SmallVector Mask(NumElts); std::iota(Mask.begin(), Mask.end(), 0); Mask[Index] = Index + NumElts; + + Type *ScalarTy = VecTy->getScalarType(); InstructionCost OldCost = TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) + TTI.getVectorInstrCost(I, VecTy, CostKind, Index); @@ -697,33 +698,14 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); - bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; - // If the lengths of the two vectors are not equal, - // we need to add a length-change vector. Add this cost. - SmallVector SrcMask; - if (NeedLenChg) { - SrcMask.assign(NumElts, PoisonMaskElem); - SrcMask[Index] = Index; - NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - SrcVecTy, SrcMask, CostKind); - } - if (NewCost > OldCost) return false; - Value *NewShuf; - // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index + // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> + // shuffle DestVec, (fneg SrcVec), Mask Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg); - if (NeedLenChg) { - // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask - Value *LenChgShuf = Builder.CreateShuffleVector(SrcVec, SrcMask); - NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask); - } else { - // shuffle DestVec, (fneg SrcVec), Mask - NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); - } - - replaceValue(I, *NewShuf); + Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask); + replaceValue(I, *Shuf); return true; } diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll index 83f94ba46a072f..df5fcdb7beb656 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll @@ -18,19 +18,6 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } -define <4 x float> @ext0_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext0_v2f32v4f32( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 0 -; CHECK-NEXT: ret <4 x float> [[R]] -; - %e = extractelement <2 x float> %x, i32 0 - %n = fneg float %e - %r = insertelement <4 x float> %y, float %n, i32 0 - ret <4 x float> %r -} - ; Eliminating extract/insert is profitable. define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { @@ -45,19 +32,6 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } -define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext2_v2f32v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[X:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[R]] -; - %e = extractelement <2 x float> %x, i32 2 - %n = fneg float %e - %r = insertelement <4 x float> %y, float %n, i32 2 - ret <4 x float> %r -} - ; Eliminating extract/insert is still profitable. Flags propagate. 
define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { @@ -72,25 +46,6 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) { ret <2 x double> %r } -define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) { -; SSE-LABEL: @ext1_v2f64v4f64( -; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 -; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1 -; SSE-NEXT: ret <4 x double> [[R]] -; -; AVX-LABEL: @ext1_v2f64v4f64( -; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[X]], <2 x double> poison, <4 x i32> -; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[R]] -; - %e = extractelement <2 x double> %x, i32 1 - %n = fneg nsz double %e - %r = insertelement <4 x double> %y, double %n, i32 1 - ret <4 x double> %r -} - ; The vector fneg would cost twice as much as the scalar op with SSE, ; so we don't transform there (the shuffle would also be more expensive). @@ -112,19 +67,6 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } -define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) { -; CHECK-LABEL: @ext7_v4f32v8f32( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7 -; CHECK-NEXT: ret <8 x float> [[R]] -; - %e = extractelement <4 x float> %x, i32 3 - %n = fneg float %e - %r = insertelement <8 x float> %y, float %n, i32 7 - ret <8 x float> %r -} - ; Same as above with an extra use of the extracted element. define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { @@ -149,21 +91,6 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } -define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) { -; CHECK-LABEL: @ext7_v4f32v8f32_use1( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: call void @use(float [[E]]) -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 -; CHECK-NEXT: ret <8 x float> [[R]] -; - %e = extractelement <4 x float> %x, i32 3 - call void @use(float %e) - %n = fneg float %e - %r = insertelement <8 x float> %y, float %n, i32 3 - ret <8 x float> %r -} - ; Negative test - the transform is likely not profitable if the fneg has another use. define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { @@ -181,21 +108,6 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) { ret <8 x float> %r } -define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) { -; CHECK-LABEL: @ext7_v4f32v8f32_use2( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: call void @use(float [[N]]) -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3 -; CHECK-NEXT: ret <8 x float> [[R]] -; - %e = extractelement <4 x float> %x, i32 3 - %n = fneg float %e - call void @use(float %n) - %r = insertelement <8 x float> %y, float %n, i32 3 - ret <8 x float> %r -} - ; Negative test - can't convert variable index to a shuffle. 
define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) { @@ -211,19 +123,6 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 % ret <2 x double> %r } -define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y, i32 %index) { -; CHECK-LABEL: @ext_index_var_v2f64v4f64( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]] -; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 [[INDEX]] -; CHECK-NEXT: ret <4 x double> [[R]] -; - %e = extractelement <2 x double> %x, i32 %index - %n = fneg nsz double %e - %r = insertelement <4 x double> %y, double %n, i32 %index - ret <4 x double> %r -} - ; Negative test - require same extract/insert index for simple shuffle. ; TODO: We could handle this by adjusting the cost calculation. @@ -240,33 +139,6 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) { ret <2 x double> %r } -; Negative test - extract from an index greater than the vector width of the destination -define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) { -; CHECK-LABEL: @ext3_v4f64v2f64( -; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3 -; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1 -; CHECK-NEXT: ret <2 x double> [[R]] -; - %e = extractelement <4 x double> %x, i32 3 - %n = fneg nsz double %e - %r = insertelement <2 x double> %y, double %n, i32 1 - ret <2 x double> %r -} - -define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) { -; CHECK-LABEL: @ext1_v2f64v4f64_ins0( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1 -; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0 -; CHECK-NEXT: ret <4 x double> [[R]] -; - %e = extractelement <2 x double> %x, i32 1 - %n = fneg nsz double %e - %r = insertelement <4 x double> %y, double %n, i32 0 - ret <4 x double> %r -} - ; Negative test - avoid changing poison ops define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { @@ -282,19 +154,6 @@ define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) { ret <4 x float> %r } -define <4 x float> @ext12_v2f32v4f32(<2 x float> %x, <4 x float> %y) { -; CHECK-LABEL: @ext12_v2f32v4f32( -; CHECK-NEXT: [[E:%.*]] = extractelement <2 x float> [[X:%.*]], i32 6 -; CHECK-NEXT: [[N:%.*]] = fneg float [[E]] -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 12 -; CHECK-NEXT: ret <4 x float> [[R]] -; - %e = extractelement <2 x float> %x, i32 6 - %n = fneg float %e - %r = insertelement <4 x float> %y, float %n, i32 12 - ret <4 x float> %r -} - ; This used to crash because we assumed matching a true, unary fneg instruction. 
define <2 x float> @ext1_v2f32_fsub(<2 x float> %x) {
@@ -322,16 +181,3 @@ define <2 x float> @ext1_v2f32_fsub_fmf(<2 x float> %x, <2 x float> %y) {
   %r = insertelement <2 x float> %y, float %s, i32 1
   ret <2 x float> %r
 }
-
-define <4 x float> @ext1_v2f32v4f32_fsub_fmf(<2 x float> %x, <4 x float> %y) {
-; CHECK-LABEL: @ext1_v2f32v4f32_fsub_fmf(
-; CHECK-NEXT:    [[TMP1:%.*]] = fneg nnan nsz <2 x float> [[X:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[X]], <2 x float> poison, <4 x i32> 
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> 
-; CHECK-NEXT:    ret <4 x float> [[R]]
-;
-  %e = extractelement <2 x float> %x, i32 1
-  %s = fsub nsz nnan float 0.0, %e
-  %r = insertelement <4 x float> %y, float %s, i32 1
-  ret <4 x float> %r
-}

From 0e8d022ffe008dd7afffa5140c4d87ce3d77902d Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Wed, 18 Dec 2024 14:47:16 +0000
Subject: [PATCH 33/37] [VPlan] Handle exit phis with multiple operands in
 addUsersInExitBlocks. (#120260)

Currently, addUsersInExitBlocks incorrectly assumes exit phis only
have a single operand, which may not be the case for loops with early
exits when they share a common exit block.

Also further relax the assertion in fixupIVUsers to allow exit values
if they come from the loop latch/middle.block.

PR: https://github.com/llvm/llvm-project/pull/120260
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 77 ++++++++-----------
 .../LoopVectorize/early_exit_legality.ll      |  4 +-
 .../single_early_exit_live_outs.ll            | 44 +++++++++--
 3 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a6acc710a34c89..a8511483e00fbe 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2905,8 +2905,17 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
     }
   }

-  assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
-         "Expected a single exit block for escaping values");
+  assert((MissingVals.empty() ||
+          all_of(MissingVals,
+                 [MiddleBlock, this](const std::pair &P) {
+                   return all_of(
+                       predecessors(cast(P.first)->getParent()),
+                       [MiddleBlock, this](BasicBlock *Pred) {
+                         return Pred == MiddleBlock ||
+                                Pred == OrigLoop->getLoopLatch();
+                       });
+                 })) &&
+         "Expected escaping values from latch/middle.block only");

   for (auto &I : MissingVals) {
     PHINode *PHI = cast(I.first);
@@ -9049,22 +9058,23 @@ addUsersInExitBlocks(VPlan &Plan,
   // Introduce extract for exiting values and update the VPIRInstructions
   // modeling the corresponding LCSSA phis.
   for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
-    VPValue *V = ExitIRI->getOperand(0);
-    // Pass live-in values used by exit phis directly through to their users in
-    // the exit block.
-    if (V->isLiveIn())
-      continue;
+    for (const auto &[Idx, Op] : enumerate(ExitIRI->operands())) {
+      // Pass live-in values used by exit phis directly through to their users
+      // in the exit block.
+      if (Op->isLiveIn())
+        continue;

-    // Currently only live-ins can be used by exit values from blocks not
-    // exiting via the vector latch through to the middle block.
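To make the shared-exit-block case from the commit message concrete, here is a minimal sketch (names are illustrative) of a loop whose uncountable early exit and countable latch exit branch to the same block, so the LCSSA phi there carries one incoming operand per exiting edge:

```llvm
define i64 @shared_exit(ptr %p) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %gep = getelementptr inbounds i8, ptr %p, i64 %iv
  %v = load i8, ptr %gep, align 1
  %early = icmp eq i8 %v, 0
  br i1 %early, label %exit, label %latch   ; uncountable early exit

latch:
  %iv.next = add i64 %iv, 1
  %done = icmp eq i64 %iv.next, 64
  br i1 %done, label %exit, label %loop     ; countable exit

exit:
  ; Two incoming operands, one per exiting edge; the old code only
  ; inspected operand 0 of such phis.
  %res = phi i64 [ %iv, %loop ], [ %iv.next, %latch ]
  ret i64 %res
}
```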
+ if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB) + return false; - LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); - VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, - {V, Plan.getOrAddLiveIn(ConstantInt::get( - IntegerType::get(Ctx, 32), 1))}); - ExitIRI->setOperand(0, Ext); + LLVMContext &Ctx = ExitIRI->getInstruction().getContext(); + VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd, + {Op, Plan.getOrAddLiveIn(ConstantInt::get( + IntegerType::get(Ctx, 32), 1))}); + ExitIRI->setOperand(Idx, Ext); + } } return true; } @@ -10226,36 +10236,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { return false; } - if (LVL.hasUncountableEarlyExit()) { - if (!EnableEarlyExitVectorization) { - reportVectorizationFailure("Auto-vectorization of loops with uncountable " - "early exit is not enabled", - "UncountableEarlyExitLoopsDisabled", ORE, L); - return false; - } - - // In addUsersInExitBlocks we already bail out if there is an outside use - // of a loop-defined variable, but it ignores induction variables which are - // handled by InnerLoopVectorizer::fixupIVUsers. We need to bail out if we - // encounter induction variables too otherwise fixupIVUsers will crash. - BasicBlock *LoopLatch = L->getLoopLatch(); - for (const auto &Induction : LVL.getInductionVars()) { - PHINode *Ind = Induction.first; - Instruction *IndUpdate = - cast(Ind->getIncomingValueForBlock(LoopLatch)); - for (Instruction *I : {cast(Ind), IndUpdate}) { - for (User *U : I->users()) { - Instruction *UI = cast(U); - if (!L->contains(UI)) { - reportVectorizationFailure( - "Auto-vectorization of loops with uncountable early exits and " - "outside uses of induction variables unsupported", - "UncountableEarlyExitLoopIndLiveOutsUnsupported", ORE, L); - return false; - } - } - } - } + if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { + reportVectorizationFailure("Auto-vectorization of loops with uncountable " + "early exit is not enabled", + "UncountableEarlyExitLoopsDisabled", ORE, L); + return false; } // Entrance to the VPlan-native vectorization path. Outer loops are processed diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll index ac78c40ec92c6c..8df0eaec6a8c9d 100644 --- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll +++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll @@ -49,7 +49,7 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exits and outside uses of induction variables unsupported +; CHECK: LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet. entry: %p1 = alloca [1024 x i8] %p2 = alloca [1024 x i8] @@ -141,7 +141,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' ; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 ; CHECK-NEXT: LV: We can vectorize this loop! -; CHECK: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exits and outside uses of induction variables unsupported +; CHECK: LV: Not vectorizing: Some exit values in loop with uncountable exit not supported yet. 
entry: %p1 = alloca [1024 x i8] call void @init_mem(ptr %p1, i64 1024) diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 7f00e77b9169dd..085438aa80f246 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization | FileCheck %s +; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -force-vector-width=4 | FileCheck %s declare void @init_mem(ptr, i64); @@ -527,24 +527,50 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) ; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 +; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_SPLIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.split: +; CHECK-NEXT: br i1 [[TMP7]], label [[LOOP_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[LOOP_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] ; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] ; CHECK: loop.inc: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 
[[INDEX_NEXT]], 67
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       loop.early.exit:
-; CHECK-NEXT:    [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ]
+; CHECK-NEXT:    [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP1]] ], [ 67, [[MIDDLE_SPLIT]] ]
 ; CHECK-NEXT:    ret i64 [[RETVAL1]]
 ; CHECK:       loop.end:
-; CHECK-NEXT:    [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ]
+; CHECK-NEXT:    [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[RETVAL2]]
 ;
 entry:
@@ -995,3 +1021,9 @@ declare i32 @foo(i32) readonly
 declare  @foo_vec()

 attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

From 6f0e9c4a5611d21cbcac4bb4f16dc90674838e1e Mon Sep 17 00:00:00 2001
From: Akash Banerjee 
Date: Wed, 18 Dec 2024 15:02:14 +0000
Subject: [PATCH 34/37] [OpenMP][Clang] Migrate OpenMP UserDefinedMapper from
 Clang to OMPIRBuilder (#110001)

This patch migrates the OpenMP UserDefinedMapper codegen from Clang to
the OpenMPIRBuilder. I will be adding further patches in the near future
so that the OpenMP dialect in MLIR can make use of these.

---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 366 +++---------------
 clang/lib/CodeGen/CGOpenMPRuntime.h           |   9 -
 clang/test/OpenMP/declare_mapper_codegen.cpp  |  48 +--
 clang/test/OpenMP/target_map_names.cpp        |   4 +-
 clang/test/OpenMP/target_map_names_attr.cpp   |   4 +-
 ...target_map_nest_defalut_mapper_codegen.cpp | 144 +++----
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  61 +++
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 291 ++++++++++++++
 8 files changed, 461 insertions(+), 466 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 90809ef90858c5..30c3834de139c3 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9042,337 +9042,69 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D,
     return;
   ASTContext &C = CGM.getContext();
   QualType Ty = D->getType();
-  QualType PtrTy = C.getPointerType(Ty).withRestrict();
-  QualType Int64Ty = C.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/true);
   auto *MapperVarDecl =
       cast(cast(D->getMapperVarRef())->getDecl());
-  SourceLocation Loc = D->getLocation();
   CharUnits ElementSize = C.getTypeSizeInChars(Ty);
   llvm::Type *ElemTy = CGM.getTypes().ConvertTypeForMem(Ty);

-  // Prepare mapper function arguments and attributes.
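For background on the feature being migrated: a user-defined mapper originates from an OpenMP `declare mapper` directive. A minimal sketch in source form (the struct, mapper name, and clauses are illustrative, patterned after the tests updated below):

```cpp
typedef struct {
  int len;
  double *data;
} Vec;

// Maps the struct itself plus the array section it points to.
#pragma omp declare mapper(id : Vec v) map(v, v.data[0 : v.len])

void use(Vec *p) {
// The named mapper drives how each element of the mapped section is expanded.
#pragma omp target map(mapper(id), tofrom : p[0 : 1])
  { p->data[0] += 1.0; }
}
```

The `.omp_mapper.*` function emitted for such a directive walks the mapped section element by element and pushes one entry per component via `__tgt_push_mapper_component`; that loop structure, visible in the tests below, is what `OMPIRBuilder::emitUserDefinedMapper` now constructs.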
- ImplicitParamDecl HandleArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - ImplicitParamDecl BaseArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy, - ImplicitParamKind::Other); - ImplicitParamDecl BeginArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, - C.VoidPtrTy, ImplicitParamKind::Other); - ImplicitParamDecl SizeArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int64Ty, - ImplicitParamKind::Other); - ImplicitParamDecl TypeArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int64Ty, - ImplicitParamKind::Other); - ImplicitParamDecl NameArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.VoidPtrTy, - ImplicitParamKind::Other); - FunctionArgList Args; - Args.push_back(&HandleArg); - Args.push_back(&BaseArg); - Args.push_back(&BeginArg); - Args.push_back(&SizeArg); - Args.push_back(&TypeArg); - Args.push_back(&NameArg); - const CGFunctionInfo &FnInfo = - CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - llvm::FunctionType *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + CodeGenFunction MapperCGF(CGM); + MappableExprsHandler::MapCombinedInfoTy CombinedInfo; + auto PrivatizeAndGenMapInfoCB = + [&](llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, llvm::Value *PtrPHI, + llvm::Value *BeginArg) -> llvm::OpenMPIRBuilder::MapInfosTy & { + MapperCGF.Builder.restoreIP(CodeGenIP); + + // Privatize the declared variable of mapper to be the current array + // element. + Address PtrCurrent( + PtrPHI, ElemTy, + Address(BeginArg, MapperCGF.VoidPtrTy, CGM.getPointerAlign()) + .getAlignment() + .alignmentOfArrayElement(ElementSize)); + CodeGenFunction::OMPPrivateScope Scope(MapperCGF); + Scope.addPrivate(MapperVarDecl, PtrCurrent); + (void)Scope.Privatize(); + + // Get map clause information. + MappableExprsHandler MEHandler(*D, MapperCGF); + MEHandler.generateAllInfoForMapper(CombinedInfo, OMPBuilder); + + auto FillInfoMap = [&](MappableExprsHandler::MappingExprInfo &MapExpr) { + return emitMappingInformation(MapperCGF, OMPBuilder, MapExpr); + }; + if (CGM.getCodeGenOpts().getDebugInfo() != + llvm::codegenoptions::NoDebugInfo) { + CombinedInfo.Names.resize(CombinedInfo.Exprs.size()); + llvm::transform(CombinedInfo.Exprs, CombinedInfo.Names.begin(), + FillInfoMap); + } + + return CombinedInfo; + }; + + auto CustomMapperCB = [&](unsigned I, llvm::Function **MapperFunc) { + if (CombinedInfo.Mappers[I]) { + // Call the corresponding mapper function. + *MapperFunc = getOrCreateUserDefinedMapperFunc( + cast(CombinedInfo.Mappers[I])); + assert(*MapperFunc && "Expect a valid mapper function is available."); + return true; + } + return false; + }; + SmallString<64> TyStr; llvm::raw_svector_ostream Out(TyStr); CGM.getCXXABI().getMangleContext().mangleCanonicalTypeName(Ty, Out); std::string Name = getName({"omp_mapper", TyStr, D->getName()}); - auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, - Name, &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, FnInfo); - Fn->removeFnAttr(llvm::Attribute::OptimizeNone); - // Start the mapper function code generation. - CodeGenFunction MapperCGF(CGM); - MapperCGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args, Loc, Loc); - // Compute the starting and end addresses of array elements. - llvm::Value *Size = MapperCGF.EmitLoadOfScalar( - MapperCGF.GetAddrOfLocalVar(&SizeArg), /*Volatile=*/false, - C.getPointerType(Int64Ty), Loc); - // Prepare common arguments for array initiation and deletion. 
- llvm::Value *Handle = MapperCGF.EmitLoadOfScalar( - MapperCGF.GetAddrOfLocalVar(&HandleArg), - /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); - llvm::Value *BaseIn = MapperCGF.EmitLoadOfScalar( - MapperCGF.GetAddrOfLocalVar(&BaseArg), - /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); - llvm::Value *BeginIn = MapperCGF.EmitLoadOfScalar( - MapperCGF.GetAddrOfLocalVar(&BeginArg), - /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); - // Convert the size in bytes into the number of array elements. - Size = MapperCGF.Builder.CreateExactUDiv( - Size, MapperCGF.Builder.getInt64(ElementSize.getQuantity())); - llvm::Value *PtrBegin = MapperCGF.Builder.CreateBitCast( - BeginIn, CGM.getTypes().ConvertTypeForMem(PtrTy)); - llvm::Value *PtrEnd = MapperCGF.Builder.CreateGEP(ElemTy, PtrBegin, Size); - llvm::Value *MapType = MapperCGF.EmitLoadOfScalar( - MapperCGF.GetAddrOfLocalVar(&TypeArg), /*Volatile=*/false, - C.getPointerType(Int64Ty), Loc); - llvm::Value *MapName = MapperCGF.EmitLoadOfScalar( - MapperCGF.GetAddrOfLocalVar(&NameArg), - /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); - - // Emit array initiation if this is an array section and \p MapType indicates - // that memory allocation is required. - llvm::BasicBlock *HeadBB = MapperCGF.createBasicBlock("omp.arraymap.head"); - emitUDMapperArrayInitOrDel(MapperCGF, Handle, BaseIn, BeginIn, Size, MapType, - MapName, ElementSize, HeadBB, /*IsInit=*/true); - - // Emit a for loop to iterate through SizeArg of elements and map all of them. - - // Emit the loop header block. - MapperCGF.EmitBlock(HeadBB); - llvm::BasicBlock *BodyBB = MapperCGF.createBasicBlock("omp.arraymap.body"); - llvm::BasicBlock *DoneBB = MapperCGF.createBasicBlock("omp.done"); - // Evaluate whether the initial condition is satisfied. - llvm::Value *IsEmpty = - MapperCGF.Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty"); - MapperCGF.Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB); - llvm::BasicBlock *EntryBB = MapperCGF.Builder.GetInsertBlock(); - - // Emit the loop body block. - MapperCGF.EmitBlock(BodyBB); - llvm::BasicBlock *LastBB = BodyBB; - llvm::PHINode *PtrPHI = MapperCGF.Builder.CreatePHI( - PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent"); - PtrPHI->addIncoming(PtrBegin, EntryBB); - Address PtrCurrent(PtrPHI, ElemTy, - MapperCGF.GetAddrOfLocalVar(&BeginArg) - .getAlignment() - .alignmentOfArrayElement(ElementSize)); - // Privatize the declared variable of mapper to be the current array element. - CodeGenFunction::OMPPrivateScope Scope(MapperCGF); - Scope.addPrivate(MapperVarDecl, PtrCurrent); - (void)Scope.Privatize(); - // Get map clause information. Fill up the arrays with all mapped variables. - MappableExprsHandler::MapCombinedInfoTy Info; - MappableExprsHandler MEHandler(*D, MapperCGF); - MEHandler.generateAllInfoForMapper(Info, OMPBuilder); - - // Call the runtime API __tgt_mapper_num_components to get the number of - // pre-existing components. - llvm::Value *OffloadingArgs[] = {Handle}; - llvm::Value *PreviousSize = MapperCGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___tgt_mapper_num_components), - OffloadingArgs); - llvm::Value *ShiftedPreviousSize = MapperCGF.Builder.CreateShl( - PreviousSize, - MapperCGF.Builder.getInt64(MappableExprsHandler::getFlagMemberOffset())); - - // Fill up the runtime mapper handle for all components. 
- for (unsigned I = 0; I < Info.BasePointers.size(); ++I) { - llvm::Value *CurBaseArg = MapperCGF.Builder.CreateBitCast( - Info.BasePointers[I], CGM.getTypes().ConvertTypeForMem(C.VoidPtrTy)); - llvm::Value *CurBeginArg = MapperCGF.Builder.CreateBitCast( - Info.Pointers[I], CGM.getTypes().ConvertTypeForMem(C.VoidPtrTy)); - llvm::Value *CurSizeArg = Info.Sizes[I]; - llvm::Value *CurNameArg = - (CGM.getCodeGenOpts().getDebugInfo() == - llvm::codegenoptions::NoDebugInfo) - ? llvm::ConstantPointerNull::get(CGM.VoidPtrTy) - : emitMappingInformation(MapperCGF, OMPBuilder, Info.Exprs[I]); - - // Extract the MEMBER_OF field from the map type. - llvm::Value *OriMapType = MapperCGF.Builder.getInt64( - static_cast>( - Info.Types[I])); - llvm::Value *MemberMapType = - MapperCGF.Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize); - - // Combine the map type inherited from user-defined mapper with that - // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM - // bits of the \a MapType, which is the input argument of the mapper - // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM - // bits of MemberMapType. - // [OpenMP 5.0], 1.2.6. map-type decay. - // | alloc | to | from | tofrom | release | delete - // ---------------------------------------------------------- - // alloc | alloc | alloc | alloc | alloc | release | delete - // to | alloc | to | alloc | to | release | delete - // from | alloc | alloc | from | from | release | delete - // tofrom | alloc | to | from | tofrom | release | delete - llvm::Value *LeftToFrom = MapperCGF.Builder.CreateAnd( - MapType, - MapperCGF.Builder.getInt64( - static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_TO | - OpenMPOffloadMappingFlags::OMP_MAP_FROM))); - llvm::BasicBlock *AllocBB = MapperCGF.createBasicBlock("omp.type.alloc"); - llvm::BasicBlock *AllocElseBB = - MapperCGF.createBasicBlock("omp.type.alloc.else"); - llvm::BasicBlock *ToBB = MapperCGF.createBasicBlock("omp.type.to"); - llvm::BasicBlock *ToElseBB = MapperCGF.createBasicBlock("omp.type.to.else"); - llvm::BasicBlock *FromBB = MapperCGF.createBasicBlock("omp.type.from"); - llvm::BasicBlock *EndBB = MapperCGF.createBasicBlock("omp.type.end"); - llvm::Value *IsAlloc = MapperCGF.Builder.CreateIsNull(LeftToFrom); - MapperCGF.Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB); - // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM. - MapperCGF.EmitBlock(AllocBB); - llvm::Value *AllocMapType = MapperCGF.Builder.CreateAnd( - MemberMapType, - MapperCGF.Builder.getInt64( - ~static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_TO | - OpenMPOffloadMappingFlags::OMP_MAP_FROM))); - MapperCGF.Builder.CreateBr(EndBB); - MapperCGF.EmitBlock(AllocElseBB); - llvm::Value *IsTo = MapperCGF.Builder.CreateICmpEQ( - LeftToFrom, - MapperCGF.Builder.getInt64( - static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_TO))); - MapperCGF.Builder.CreateCondBr(IsTo, ToBB, ToElseBB); - // In case of to, clear OMP_MAP_FROM. - MapperCGF.EmitBlock(ToBB); - llvm::Value *ToMapType = MapperCGF.Builder.CreateAnd( - MemberMapType, - MapperCGF.Builder.getInt64( - ~static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_FROM))); - MapperCGF.Builder.CreateBr(EndBB); - MapperCGF.EmitBlock(ToElseBB); - llvm::Value *IsFrom = MapperCGF.Builder.CreateICmpEQ( - LeftToFrom, - MapperCGF.Builder.getInt64( - static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_FROM))); - MapperCGF.Builder.CreateCondBr(IsFrom, FromBB, EndBB); - // In case of from, clear OMP_MAP_TO. 
- MapperCGF.EmitBlock(FromBB); - llvm::Value *FromMapType = MapperCGF.Builder.CreateAnd( - MemberMapType, - MapperCGF.Builder.getInt64( - ~static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_TO))); - // In case of tofrom, do nothing. - MapperCGF.EmitBlock(EndBB); - LastBB = EndBB; - llvm::PHINode *CurMapType = - MapperCGF.Builder.CreatePHI(CGM.Int64Ty, 4, "omp.maptype"); - CurMapType->addIncoming(AllocMapType, AllocBB); - CurMapType->addIncoming(ToMapType, ToBB); - CurMapType->addIncoming(FromMapType, FromBB); - CurMapType->addIncoming(MemberMapType, ToElseBB); - - llvm::Value *OffloadingArgs[] = {Handle, CurBaseArg, CurBeginArg, - CurSizeArg, CurMapType, CurNameArg}; - if (Info.Mappers[I]) { - // Call the corresponding mapper function. - llvm::Function *MapperFunc = getOrCreateUserDefinedMapperFunc( - cast(Info.Mappers[I])); - assert(MapperFunc && "Expect a valid mapper function is available."); - MapperCGF.EmitNounwindRuntimeCall(MapperFunc, OffloadingArgs); - } else { - // Call the runtime API __tgt_push_mapper_component to fill up the runtime - // data structure. - MapperCGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___tgt_push_mapper_component), - OffloadingArgs); - } - } - - // Update the pointer to point to the next element that needs to be mapped, - // and check whether we have mapped all elements. - llvm::Value *PtrNext = MapperCGF.Builder.CreateConstGEP1_32( - ElemTy, PtrPHI, /*Idx0=*/1, "omp.arraymap.next"); - PtrPHI->addIncoming(PtrNext, LastBB); - llvm::Value *IsDone = - MapperCGF.Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone"); - llvm::BasicBlock *ExitBB = MapperCGF.createBasicBlock("omp.arraymap.exit"); - MapperCGF.Builder.CreateCondBr(IsDone, ExitBB, BodyBB); - - MapperCGF.EmitBlock(ExitBB); - // Emit array deletion if this is an array section and \p MapType indicates - // that deletion is required. - emitUDMapperArrayInitOrDel(MapperCGF, Handle, BaseIn, BeginIn, Size, MapType, - MapName, ElementSize, DoneBB, /*IsInit=*/false); - - // Emit the function exit block. - MapperCGF.EmitBlock(DoneBB, /*IsFinished=*/true); - MapperCGF.FinishFunction(); - UDMMap.try_emplace(D, Fn); + auto *NewFn = OMPBuilder.emitUserDefinedMapper(PrivatizeAndGenMapInfoCB, + ElemTy, Name, CustomMapperCB); + UDMMap.try_emplace(D, NewFn); if (CGF) FunctionUDMMap[CGF->CurFn].push_back(D); } -/// Emit the array initialization or deletion portion for user-defined mapper -/// code generation. First, it evaluates whether an array section is mapped and -/// whether the \a MapType instructs to delete this section. If \a IsInit is -/// true, and \a MapType indicates to not delete this array, array -/// initialization code is generated. If \a IsInit is false, and \a MapType -/// indicates to not this array, array deletion code is generated. -void CGOpenMPRuntime::emitUDMapperArrayInitOrDel( - CodeGenFunction &MapperCGF, llvm::Value *Handle, llvm::Value *Base, - llvm::Value *Begin, llvm::Value *Size, llvm::Value *MapType, - llvm::Value *MapName, CharUnits ElementSize, llvm::BasicBlock *ExitBB, - bool IsInit) { - StringRef Prefix = IsInit ? ".init" : ".del"; - - // Evaluate if this is an array section. 
- llvm::BasicBlock *BodyBB = - MapperCGF.createBasicBlock(getName({"omp.array", Prefix})); - llvm::Value *IsArray = MapperCGF.Builder.CreateICmpSGT( - Size, MapperCGF.Builder.getInt64(1), "omp.arrayinit.isarray"); - llvm::Value *DeleteBit = MapperCGF.Builder.CreateAnd( - MapType, - MapperCGF.Builder.getInt64( - static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_DELETE))); - llvm::Value *DeleteCond; - llvm::Value *Cond; - if (IsInit) { - // base != begin? - llvm::Value *BaseIsBegin = MapperCGF.Builder.CreateICmpNE(Base, Begin); - // IsPtrAndObj? - llvm::Value *PtrAndObjBit = MapperCGF.Builder.CreateAnd( - MapType, - MapperCGF.Builder.getInt64( - static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ))); - PtrAndObjBit = MapperCGF.Builder.CreateIsNotNull(PtrAndObjBit); - BaseIsBegin = MapperCGF.Builder.CreateAnd(BaseIsBegin, PtrAndObjBit); - Cond = MapperCGF.Builder.CreateOr(IsArray, BaseIsBegin); - DeleteCond = MapperCGF.Builder.CreateIsNull( - DeleteBit, getName({"omp.array", Prefix, ".delete"})); - } else { - Cond = IsArray; - DeleteCond = MapperCGF.Builder.CreateIsNotNull( - DeleteBit, getName({"omp.array", Prefix, ".delete"})); - } - Cond = MapperCGF.Builder.CreateAnd(Cond, DeleteCond); - MapperCGF.Builder.CreateCondBr(Cond, BodyBB, ExitBB); - - MapperCGF.EmitBlock(BodyBB); - // Get the array size by multiplying element size and element number (i.e., \p - // Size). - llvm::Value *ArraySize = MapperCGF.Builder.CreateNUWMul( - Size, MapperCGF.Builder.getInt64(ElementSize.getQuantity())); - // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves - // memory allocation/deletion purpose only. - llvm::Value *MapTypeArg = MapperCGF.Builder.CreateAnd( - MapType, - MapperCGF.Builder.getInt64( - ~static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_TO | - OpenMPOffloadMappingFlags::OMP_MAP_FROM))); - MapTypeArg = MapperCGF.Builder.CreateOr( - MapTypeArg, - MapperCGF.Builder.getInt64( - static_cast>( - OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT))); - - // Call the runtime API __tgt_push_mapper_component to fill up the runtime - // data structure. - llvm::Value *OffloadingArgs[] = {Handle, Base, Begin, - ArraySize, MapTypeArg, MapName}; - MapperCGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___tgt_push_mapper_component), - OffloadingArgs); -} - llvm::Function *CGOpenMPRuntime::getOrCreateUserDefinedMapperFunc( const OMPDeclareMapperDecl *D) { auto I = UDMMap.find(D); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 56d502d92806eb..8ab5ee70a19fa2 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -559,15 +559,6 @@ class CGOpenMPRuntime { llvm::Value *Ctor, llvm::Value *CopyCtor, llvm::Value *Dtor, SourceLocation Loc); - /// Emit the array initialization or deletion portion for user-defined mapper - /// code generation. 
- void emitUDMapperArrayInitOrDel(CodeGenFunction &MapperCGF, - llvm::Value *Handle, llvm::Value *BasePtr, - llvm::Value *Ptr, llvm::Value *Size, - llvm::Value *MapType, llvm::Value *MapName, - CharUnits ElementSize, - llvm::BasicBlock *ExitBB, bool IsInit); - struct TaskResultTy { llvm::Value *NewTask = nullptr; llvm::Function *TaskEntry = nullptr; diff --git a/clang/test/OpenMP/declare_mapper_codegen.cpp b/clang/test/OpenMP/declare_mapper_codegen.cpp index d2954b7a748217..f9da3d97766d96 100644 --- a/clang/test/OpenMP/declare_mapper_codegen.cpp +++ b/clang/test/OpenMP/declare_mapper_codegen.cpp @@ -86,19 +86,9 @@ class C { #pragma omp declare mapper(id: C s) map(s.a, s.b[0:2]) -// CK0: define {{.*}}void [[MPRFUNC:@[.]omp_mapper[.].*C[.]id]](ptr{{.*}}, ptr{{.*}}, ptr{{.*}}, i64{{.*}}, i64{{.*}}, ptr{{.*}}) -// CK0: store ptr %{{[^,]+}}, ptr [[HANDLEADDR:%[^,]+]] -// CK0: store ptr %{{[^,]+}}, ptr [[BPTRADDR:%[^,]+]] -// CK0: store ptr %{{[^,]+}}, ptr [[VPTRADDR:%[^,]+]] -// CK0: store i64 %{{[^,]+}}, ptr [[SIZEADDR:%[^,]+]] -// CK0: store i64 %{{[^,]+}}, ptr [[TYPEADDR:%[^,]+]] -// CK0-DAG: [[BYTESIZE:%.+]] = load i64, ptr [[SIZEADDR]] +// CK0: define {{.*}}void [[MPRFUNC:@[.]omp_mapper[.].*C[.]id]](ptr noundef [[HANDLE:%.+]], ptr noundef [[BPTR:%.+]], ptr noundef [[BEGIN:%.+]], i64 noundef [[BYTESIZE:%.+]], i64 noundef [[TYPE:%.+]], ptr{{.*}}) // CK0-64-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 16 // CK0-32-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 8 -// CK0-DAG: [[TYPE:%.+]] = load i64, ptr [[TYPEADDR]] -// CK0-DAG: [[HANDLE:%.+]] = load ptr, ptr [[HANDLEADDR]] -// CK0-DAG: [[BPTR:%.+]] = load ptr, ptr [[BPTRADDR]] -// CK0-DAG: [[BEGIN:%.+]] = load ptr, ptr [[VPTRADDR]] // CK0-DAG: [[ISARRAY:%.+]] = icmp sgt i64 [[SIZE]], 1 // CK0-DAG: [[PTREND:%.+]] = getelementptr %class.C, ptr [[BEGIN]], i64 [[SIZE]] // CK0-DAG: [[PTRSNE:%.+]] = icmp ne ptr [[BPTR]], [[BEGIN]] @@ -597,18 +587,8 @@ class C { #pragma omp declare mapper(id: C s) map(s.a) -// CK1-LABEL: define {{.*}}void @.omp_mapper.{{.*}}C{{.*}}.id{{.*}}(ptr{{.*}}, ptr{{.*}}, ptr{{.*}}, i64{{.*}}, i64{{.*}}, ptr{{.*}}) -// CK1: store ptr %{{[^,]+}}, ptr [[HANDLEADDR:%[^,]+]] -// CK1: store ptr %{{[^,]+}}, ptr [[BPTRADDR:%[^,]+]] -// CK1: store ptr %{{[^,]+}}, ptr [[VPTRADDR:%[^,]+]] -// CK1: store i64 %{{[^,]+}}, ptr [[SIZEADDR:%[^,]+]] -// CK1: store i64 %{{[^,]+}}, ptr [[TYPEADDR:%[^,]+]] -// CK1-DAG: [[BYTESIZE:%.+]] = load i64, ptr [[SIZEADDR]] +// CK1: define {{.*}}void @.omp_mapper.{{.*}}C{{.*}}.id{{.*}}(ptr noundef [[HANDLE:%.+]], ptr noundef [[BPTR:%.+]], ptr noundef [[BEGIN:%.+]], i64 noundef [[BYTESIZE:%.+]], i64 noundef [[TYPE:%.+]], ptr{{.*}}) // CK1-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 4 -// CK1-DAG: [[TYPE:%.+]] = load i64, ptr [[TYPEADDR]] -// CK1-DAG: [[HANDLE:%.+]] = load ptr, ptr [[HANDLEADDR]] -// CK1-DAG: [[BPTR:%.+]] = load ptr, ptr [[BPTRADDR]] -// CK1-DAG: [[BEGIN:%.+]] = load ptr, ptr [[VPTRADDR]] // CK1-DAG: [[PTREND:%.+]] = getelementptr %class.C, ptr [[BEGIN]], i64 [[SIZE]] // CK1-DAG: [[ISARRAY:%.+]] = icmp sgt i64 [[SIZE]], 1 // CK1-DAG: [[PTRSNE:%.+]] = icmp ne ptr [[BPTR]], [[BEGIN]] @@ -717,18 +697,8 @@ class C { // CK2: define {{.*}}void [[BMPRFUNC:@[.]omp_mapper[.].*B[.]default]](ptr{{.*}}, ptr{{.*}}, ptr{{.*}}, i64{{.*}}, i64{{.*}}, ptr{{.*}}) -// CK2-LABEL: define {{.*}}void @.omp_mapper.{{.*}}C{{.*}}.id(ptr{{.*}}, ptr{{.*}}, ptr{{.*}}, i64{{.*}}, i64{{.*}}, ptr{{.*}}) -// CK2: store ptr %{{[^,]+}}, ptr [[HANDLEADDR:%[^,]+]] -// CK2: store ptr %{{[^,]+}}, ptr 
[[BPTRADDR:%[^,]+]] -// CK2: store ptr %{{[^,]+}}, ptr [[VPTRADDR:%[^,]+]] -// CK2: store i64 %{{[^,]+}}, ptr [[SIZEADDR:%[^,]+]] -// CK2: store i64 %{{[^,]+}}, ptr [[TYPEADDR:%[^,]+]] -// CK2-DAG: [[BYTESIZE:%.+]] = load i64, ptr [[SIZEADDR]] +// CK2: define {{.*}}void @.omp_mapper.{{.*}}C{{.*}}.id(ptr noundef [[HANDLE:%.+]], ptr noundef [[BPTR:%.+]], ptr noundef [[BEGIN:%.+]], i64 noundef [[BYTESIZE:%.+]], i64 noundef [[TYPE:%.+]], ptr{{.*}}) // CK2-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 16 -// CK2-DAG: [[TYPE:%.+]] = load i64, ptr [[TYPEADDR]] -// CK2-DAG: [[HANDLE:%.+]] = load ptr, ptr [[HANDLEADDR]] -// CK2-DAG: [[BPTR:%.+]] = load ptr, ptr [[BPTRADDR]] -// CK2-DAG: [[BEGIN:%.+]] = load ptr, ptr [[VPTRADDR]] // CK2-DAG: [[PTREND:%.+]] = getelementptr %class.C, ptr [[BEGIN]], i64 [[SIZE]] // CK2-DAG: [[ISARRAY:%.+]] = icmp sgt i64 [[SIZE]], 1 // CK2-DAG: [[PTRSNE:%.+]] = icmp ne ptr [[BPTR]], [[BEGIN]] @@ -921,19 +891,9 @@ class C { #pragma omp declare mapper(id: C s) map(s.a, s.b[0:2]) -// CK4: define {{.*}}void [[MPRFUNC:@[.]omp_mapper[.].*C[.]id]](ptr{{.*}}, ptr{{.*}}, ptr{{.*}}, i64{{.*}}, i64{{.*}}, ptr{{.*}}) -// CK4: store ptr %{{[^,]+}}, ptr [[HANDLEADDR:%[^,]+]] -// CK4: store ptr %{{[^,]+}}, ptr [[BPTRADDR:%[^,]+]] -// CK4: store ptr %{{[^,]+}}, ptr [[VPTRADDR:%[^,]+]] -// CK4: store i64 %{{[^,]+}}, ptr [[SIZEADDR:%[^,]+]] -// CK4: store i64 %{{[^,]+}}, ptr [[TYPEADDR:%[^,]+]] -// CK4-DAG: [[BYTESIZE:%.+]] = load i64, ptr [[SIZEADDR]] +// CK4: define {{.*}}void [[MPRFUNC:@[.]omp_mapper[.].*C[.]id]](ptr noundef [[HANDLE:%.+]], ptr noundef [[BPTR:%.+]], ptr noundef [[BEGIN:%.+]], i64 noundef [[BYTESIZE:%.+]], i64 noundef [[TYPE:%.+]], ptr{{.*}}) // CK4-64-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 16 // CK4-32-DAG: [[SIZE:%.+]] = udiv exact i64 [[BYTESIZE]], 8 -// CK4-DAG: [[TYPE:%.+]] = load i64, ptr [[TYPEADDR]] -// CK4-DAG: [[HANDLE:%.+]] = load ptr, ptr [[HANDLEADDR]] -// CK4-DAG: [[BPTR:%.+]] = load ptr, ptr [[BPTRADDR]] -// CK4-DAG: [[BEGIN:%.+]] = load ptr, ptr [[VPTRADDR]] // CK4-DAG: [[PTREND:%.+]] = getelementptr %class.C, ptr [[BEGIN]], i64 [[SIZE]] // CK4-DAG: [[ISARRAY:%.+]] = icmp sgt i64 [[SIZE]], 1 // CK4-DAG: [[PTRSNE:%.+]] = icmp ne ptr [[BPTR]], [[BEGIN]] diff --git a/clang/test/OpenMP/target_map_names.cpp b/clang/test/OpenMP/target_map_names.cpp index c1c2015609fb79..3ee28d3ce5ce97 100644 --- a/clang/test/OpenMP/target_map_names.cpp +++ b/clang/test/OpenMP/target_map_names.cpp @@ -201,9 +201,7 @@ void secondMapNameInClause() { // DEBUG: store ptr @[[NAME:.offload_mapnames.[0-9]+]], ptr %[[ARG:.+]] // CHECK-NOT: store ptr @[[NAME:.offload_mapnames.[0-9]+]], ptr %[[ARG:.+]] -// DEBUG: void @.omp_mapper._ZTS2S3.id(ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr noundef [[NAME_ARG:%.+]]) -// DEBUG: store ptr [[NAME_ARG]], ptr [[NAME_STACK:%.+]] -// DEBUG: [[MAPPER_NAME:%.+]] = load ptr, ptr [[NAME_STACK]] +// DEBUG: void @.omp_mapper._ZTS2S3.id(ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr noundef [[MAPPER_NAME:%.+]]) // DEBUG: call void @__tgt_push_mapper_component(ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, ptr [[MAPPER_NAME]]) #endif diff --git a/clang/test/OpenMP/target_map_names_attr.cpp b/clang/test/OpenMP/target_map_names_attr.cpp index cb108474b3561c..e6b0e1beb5bd5d 100644 --- a/clang/test/OpenMP/target_map_names_attr.cpp +++ b/clang/test/OpenMP/target_map_names_attr.cpp @@ -186,9 +186,7 @@ void secondMapNameInClause() { // DEBUG: store ptr @[[NAME:.offload_mapnames.[0-9]+]], ptr 
%[[ARG:.+]] // CHECK-NOT: store ptr @[[NAME:.offload_mapnames.[0-9]+]], ptr %[[ARG:.+]] -// DEBUG: void @.omp_mapper._ZTS2S3.id(ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr noundef [[NAME_ARG:%.+]]) -// DEBUG: store ptr [[NAME_ARG]], ptr [[NAME_STACK:%.+]] -// DEBUG: [[MAPPER_NAME:%.+]] = load ptr, ptr [[NAME_STACK]] +// DEBUG: void @.omp_mapper._ZTS2S3.id(ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr noundef [[MAPPER_NAME:%.+]]) // DEBUG: call void @__tgt_push_mapper_component(ptr %{{.*}}, ptr %{{.*}}, ptr %{{.*}}, i64 %{{.*}}, i64 %{{.*}}, ptr [[MAPPER_NAME]]) #endif diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp index 775f0b296b1b63..0fc6de0e4279a5 100644 --- a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp +++ b/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp @@ -109,30 +109,12 @@ void foo() { // CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1D.default // CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca i64, align 8 -// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca i64, align 8 -// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 -// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 -// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8 -// CHECK-NEXT: store i64 [[TMP3]], ptr [[DOTADDR3]], align 8 -// CHECK-NEXT: store i64 [[TMP4]], ptr [[DOTADDR4]], align 8 -// CHECK-NEXT: store ptr [[TMP5]], ptr [[DOTADDR5]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTADDR3]], align 8 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = udiv exact i64 [[TMP6]], 12 -// CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[STRUCT_D:%.*]], ptr [[TMP9]], i64 [[TMP10]] -// CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTADDR4]], align 8 -// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR5]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = udiv exact i64 [[TMP3]], 12 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[STRUCT_D:%.*]], ptr [[TMP2]], i64 [[TMP10]] // CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP10]], 1 -// CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP12]], 8 -// CHECK-NEXT: [[TMP15:%.*]] = icmp ne ptr [[TMP8]], [[TMP9]] -// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP12]], 16 +// CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[TMP15:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP4]], 16 // CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP16]], 0 // CHECK-NEXT: [[TMP18:%.*]] = and i1 [[TMP15]], [[TMP17]] // CHECK-NEXT: [[TMP19:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP18]] @@ -141,15 +123,15 @@ void foo() { // CHECK-NEXT: br i1 [[TMP20]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]] // CHECK: .omp.array..init: // CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP10]], 12 -// CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP12]], -4 +// CHECK-NEXT: [[TMP22:%.*]] = and 
i64 [[TMP4]], -4 // CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], 512 -// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[TMP8]], ptr [[TMP9]], i64 [[TMP21]], i64 [[TMP23]], ptr [[TMP13]]) +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP21]], i64 [[TMP23]], ptr [[TMP5]]) // CHECK-NEXT: br label [[OMP_ARRAYMAP_HEAD]] // CHECK: omp.arraymap.head: -// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP9]], [[TMP11]] +// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP11]] // CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]] // CHECK: omp.arraymap.body: -// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP9]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END25:%.*]] ] +// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END25:%.*]] ] // CHECK-NEXT: [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0 // CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 1 // CHECK-NEXT: [[H:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 2 @@ -158,10 +140,10 @@ void foo() { // CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[E]] to i64 // CHECK-NEXT: [[TMP27:%.*]] = sub i64 [[TMP25]], [[TMP26]] // CHECK-NEXT: [[TMP28:%.*]] = sdiv exact i64 [[TMP27]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) -// CHECK-NEXT: [[TMP29:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP7]]) +// CHECK-NEXT: [[TMP29:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]]) // CHECK-NEXT: [[TMP30:%.*]] = shl i64 [[TMP29]], 48 // CHECK-NEXT: [[TMP31:%.*]] = add nuw i64 0, [[TMP30]] -// CHECK-NEXT: [[TMP32:%.*]] = and i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP32:%.*]] = and i64 [[TMP4]], 3 // CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[TMP32]], 0 // CHECK-NEXT: br i1 [[TMP33]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]] // CHECK: omp.type.alloc: @@ -181,87 +163,87 @@ void foo() { // CHECK-NEXT: br label [[OMP_TYPE_END]] // CHECK: omp.type.end: // CHECK-NEXT: [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP34]], [[OMP_TYPE_ALLOC]] ], [ [[TMP36]], [[OMP_TYPE_TO]] ], [ [[TMP38]], [[OMP_TYPE_FROM]] ], [ [[TMP31]], [[OMP_TYPE_TO_ELSE]] ] -// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 [[TMP28]], i64 [[OMP_MAPTYPE]], ptr null) +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 [[TMP28]], i64 [[OMP_MAPTYPE]], ptr null) // CHECK-NEXT: [[TMP39:%.*]] = add nuw i64 281474976711171, [[TMP30]] -// CHECK-NEXT: [[TMP40:%.*]] = and i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP40:%.*]] = and i64 [[TMP4]], 3 // CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[TMP40]], 0 // CHECK-NEXT: br i1 [[TMP41]], label [[OMP_TYPE_ALLOC6:%.*]], label [[OMP_TYPE_ALLOC_ELSE7:%.*]] -// CHECK: omp.type.alloc6: +// CHECK: omp.type.alloc1: // CHECK-NEXT: [[TMP42:%.*]] = and i64 [[TMP39]], -4 // CHECK-NEXT: br label [[OMP_TYPE_END11:%.*]] -// CHECK: omp.type.alloc.else7: +// CHECK: omp.type.alloc.else2: // CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[TMP40]], 1 // CHECK-NEXT: br i1 [[TMP43]], label [[OMP_TYPE_TO8:%.*]], label [[OMP_TYPE_TO_ELSE9:%.*]] -// CHECK: omp.type.to8: +// CHECK: omp.type.to3: // CHECK-NEXT: 
[[TMP44:%.*]] = and i64 [[TMP39]], -3 // CHECK-NEXT: br label [[OMP_TYPE_END11]] -// CHECK: omp.type.to.else9: +// CHECK: omp.type.to.else4: // CHECK-NEXT: [[TMP45:%.*]] = icmp eq i64 [[TMP40]], 2 // CHECK-NEXT: br i1 [[TMP45]], label [[OMP_TYPE_FROM10:%.*]], label [[OMP_TYPE_END11]] -// CHECK: omp.type.from10: +// CHECK: omp.type.from5: // CHECK-NEXT: [[TMP46:%.*]] = and i64 [[TMP39]], -2 // CHECK-NEXT: br label [[OMP_TYPE_END11]] -// CHECK: omp.type.end11: +// CHECK: omp.type.end6: // CHECK-NEXT: [[OMP_MAPTYPE12:%.*]] = phi i64 [ [[TMP42]], [[OMP_TYPE_ALLOC6]] ], [ [[TMP44]], [[OMP_TYPE_TO8]] ], [ [[TMP46]], [[OMP_TYPE_FROM10]] ], [ [[TMP39]], [[OMP_TYPE_TO_ELSE9]] ] -// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 4, i64 [[OMP_MAPTYPE12]], ptr null) +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 4, i64 [[OMP_MAPTYPE12]], ptr null) // CHECK-NEXT: [[TMP47:%.*]] = add nuw i64 281474976711171, [[TMP30]] -// CHECK-NEXT: [[TMP48:%.*]] = and i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP48:%.*]] = and i64 [[TMP4]], 3 // CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[TMP48]], 0 // CHECK-NEXT: br i1 [[TMP49]], label [[OMP_TYPE_ALLOC13:%.*]], label [[OMP_TYPE_ALLOC_ELSE14:%.*]] -// CHECK: omp.type.alloc13: +// CHECK: omp.type.alloc8: // CHECK-NEXT: [[TMP50:%.*]] = and i64 [[TMP47]], -4 // CHECK-NEXT: br label [[OMP_TYPE_END18:%.*]] -// CHECK: omp.type.alloc.else14: +// CHECK: omp.type.alloc.else9: // CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[TMP48]], 1 // CHECK-NEXT: br i1 [[TMP51]], label [[OMP_TYPE_TO15:%.*]], label [[OMP_TYPE_TO_ELSE16:%.*]] -// CHECK: omp.type.to15: +// CHECK: omp.type.to10: // CHECK-NEXT: [[TMP52:%.*]] = and i64 [[TMP47]], -3 // CHECK-NEXT: br label [[OMP_TYPE_END18]] -// CHECK: omp.type.to.else16: +// CHECK: omp.type.to.else11: // CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[TMP48]], 2 // CHECK-NEXT: br i1 [[TMP53]], label [[OMP_TYPE_FROM17:%.*]], label [[OMP_TYPE_END18]] -// CHECK: omp.type.from17: +// CHECK: omp.type.from12: // CHECK-NEXT: [[TMP54:%.*]] = and i64 [[TMP47]], -2 // CHECK-NEXT: br label [[OMP_TYPE_END18]] -// CHECK: omp.type.end18: +// CHECK: omp.type.end13: // CHECK-NEXT: [[OMP_MAPTYPE19:%.*]] = phi i64 [ [[TMP50]], [[OMP_TYPE_ALLOC13]] ], [ [[TMP52]], [[OMP_TYPE_TO15]] ], [ [[TMP54]], [[OMP_TYPE_FROM17]] ], [ [[TMP47]], [[OMP_TYPE_TO_ELSE16]] ] -// CHECK-NEXT: call void @.omp_mapper._ZTS1C.default(ptr [[TMP7]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[F]], i64 4, i64 [[OMP_MAPTYPE19]], ptr null) #[[ATTR3]] +// CHECK-NEXT: call void @.omp_mapper._ZTS1C.default(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[F]], i64 4, i64 [[OMP_MAPTYPE19]], ptr null) #[[ATTR3]] // CHECK-NEXT: [[TMP55:%.*]] = add nuw i64 281474976711171, [[TMP30]] -// CHECK-NEXT: [[TMP56:%.*]] = and i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP56:%.*]] = and i64 [[TMP4]], 3 // CHECK-NEXT: [[TMP57:%.*]] = icmp eq i64 [[TMP56]], 0 // CHECK-NEXT: br i1 [[TMP57]], label [[OMP_TYPE_ALLOC20:%.*]], label [[OMP_TYPE_ALLOC_ELSE21:%.*]] -// CHECK: omp.type.alloc20: +// CHECK: omp.type.alloc15: // CHECK-NEXT: [[TMP58:%.*]] = and i64 [[TMP55]], -4 // CHECK-NEXT: br label [[OMP_TYPE_END25]] -// CHECK: omp.type.alloc.else21: +// CHECK: omp.type.alloc.else16: // CHECK-NEXT: [[TMP59:%.*]] = icmp eq i64 [[TMP56]], 1 // CHECK-NEXT: br i1 [[TMP59]], label [[OMP_TYPE_TO22:%.*]], label [[OMP_TYPE_TO_ELSE23:%.*]] -// CHECK: omp.type.to22: +// CHECK: omp.type.to17: // CHECK-NEXT: [[TMP60:%.*]] = 
and i64 [[TMP55]], -3 // CHECK-NEXT: br label [[OMP_TYPE_END25]] -// CHECK: omp.type.to.else23: +// CHECK: omp.type.to.else18: // CHECK-NEXT: [[TMP61:%.*]] = icmp eq i64 [[TMP56]], 2 // CHECK-NEXT: br i1 [[TMP61]], label [[OMP_TYPE_FROM24:%.*]], label [[OMP_TYPE_END25]] -// CHECK: omp.type.from24: +// CHECK: omp.type.from19: // CHECK-NEXT: [[TMP62:%.*]] = and i64 [[TMP55]], -2 // CHECK-NEXT: br label [[OMP_TYPE_END25]] -// CHECK: omp.type.end25: +// CHECK: omp.type.end20: // CHECK-NEXT: [[OMP_MAPTYPE26:%.*]] = phi i64 [ [[TMP58]], [[OMP_TYPE_ALLOC20]] ], [ [[TMP60]], [[OMP_TYPE_TO22]] ], [ [[TMP62]], [[OMP_TYPE_FROM24]] ], [ [[TMP55]], [[OMP_TYPE_TO_ELSE23]] ] -// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[H]], i64 4, i64 [[OMP_MAPTYPE26]], ptr null) +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[H]], i64 4, i64 [[OMP_MAPTYPE26]], ptr null) // CHECK-NEXT: [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1 // CHECK-NEXT: [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP11]] // CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]] // CHECK: omp.arraymap.exit: // CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY27:%.*]] = icmp sgt i64 [[TMP10]], 1 -// CHECK-NEXT: [[TMP63:%.*]] = and i64 [[TMP12]], 8 +// CHECK-NEXT: [[TMP63:%.*]] = and i64 [[TMP4]], 8 // CHECK-NEXT: [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP63]], 0 // CHECK-NEXT: [[TMP64:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY27]], [[DOTOMP_ARRAY__DEL__DELETE]] // CHECK-NEXT: br i1 [[TMP64]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]] // CHECK: .omp.array..del: // CHECK-NEXT: [[TMP65:%.*]] = mul nuw i64 [[TMP10]], 12 -// CHECK-NEXT: [[TMP66:%.*]] = and i64 [[TMP12]], -4 +// CHECK-NEXT: [[TMP66:%.*]] = and i64 [[TMP4]], -4 // CHECK-NEXT: [[TMP67:%.*]] = or i64 [[TMP66]], 512 -// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[TMP8]], ptr [[TMP9]], i64 [[TMP65]], i64 [[TMP67]], ptr [[TMP13]]) +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP65]], i64 [[TMP67]], ptr [[TMP5]]) // CHECK-NEXT: br label [[OMP_DONE]] // CHECK: omp.done: // CHECK-NEXT: ret void @@ -270,30 +252,12 @@ void foo() { // CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1C.default // CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca i64, align 8 -// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca i64, align 8 -// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 -// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 -// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8 -// CHECK-NEXT: store i64 [[TMP3]], ptr [[DOTADDR3]], align 8 -// CHECK-NEXT: store i64 [[TMP4]], ptr [[DOTADDR4]], align 8 -// CHECK-NEXT: store ptr [[TMP5]], ptr [[DOTADDR5]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTADDR3]], align 8 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 
-// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = udiv exact i64 [[TMP6]], 4 -// CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[TMP9]], i64 [[TMP10]] -// CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTADDR4]], align 8 -// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTADDR5]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = udiv exact i64 [[TMP3]], 4 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[TMP2]], i64 [[TMP10]] // CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP10]], 1 -// CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP12]], 8 -// CHECK-NEXT: [[TMP15:%.*]] = icmp ne ptr [[TMP8]], [[TMP9]] -// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP12]], 16 +// CHECK-NEXT: [[TMP14:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[TMP15:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP4]], 16 // CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP16]], 0 // CHECK-NEXT: [[TMP18:%.*]] = and i1 [[TMP15]], [[TMP17]] // CHECK-NEXT: [[TMP19:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP18]] @@ -302,20 +266,20 @@ void foo() { // CHECK-NEXT: br i1 [[TMP20]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]] // CHECK: .omp.array..init: // CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP10]], 4 -// CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP12]], -4 +// CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP4]], -4 // CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], 512 -// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[TMP8]], ptr [[TMP9]], i64 [[TMP21]], i64 [[TMP23]], ptr [[TMP13]]) +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP21]], i64 [[TMP23]], ptr [[TMP5]]) // CHECK-NEXT: br label [[OMP_ARRAYMAP_HEAD]] // CHECK: omp.arraymap.head: -// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP9]], [[TMP11]] +// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP11]] // CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]] // CHECK: omp.arraymap.body: -// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP9]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END:%.*]] ] +// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END:%.*]] ] // CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0 -// CHECK-NEXT: [[TMP24:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP7]]) +// CHECK-NEXT: [[TMP24:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]]) // CHECK-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 48 // CHECK-NEXT: [[TMP26:%.*]] = add nuw i64 1, [[TMP25]] -// CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP12]], 3 +// CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP4]], 3 // CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[TMP27]], 0 // CHECK-NEXT: br i1 [[TMP28]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]] // CHECK: omp.type.alloc: @@ -335,21 +299,21 @@ void foo() { // CHECK-NEXT: br label [[OMP_TYPE_END]] // CHECK: omp.type.end: // CHECK-NEXT: [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP29]], [[OMP_TYPE_ALLOC]] ], [ [[TMP31]], [[OMP_TYPE_TO]] ], [ [[TMP33]], [[OMP_TYPE_FROM]] ], [ [[TMP26]], [[OMP_TYPE_TO_ELSE]] ] -// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[A]], i64 4, i64 [[OMP_MAPTYPE]], ptr null) +// CHECK-NEXT: call 
void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[A]], i64 4, i64 [[OMP_MAPTYPE]], ptr null)
 // CHECK-NEXT:    [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1
 // CHECK-NEXT:    [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP11]]
 // CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]]
 // CHECK:       omp.arraymap.exit:
 // CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY6:%.*]] = icmp sgt i64 [[TMP10]], 1
-// CHECK-NEXT:    [[TMP34:%.*]] = and i64 [[TMP12]], 8
+// CHECK-NEXT:    [[TMP34:%.*]] = and i64 [[TMP4]], 8
 // CHECK-NEXT:    [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP34]], 0
 // CHECK-NEXT:    [[TMP35:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY6]], [[DOTOMP_ARRAY__DEL__DELETE]]
 // CHECK-NEXT:    br i1 [[TMP35]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]]
 // CHECK:       .omp.array..del:
 // CHECK-NEXT:    [[TMP36:%.*]] = mul nuw i64 [[TMP10]], 4
-// CHECK-NEXT:    [[TMP37:%.*]] = and i64 [[TMP12]], -4
+// CHECK-NEXT:    [[TMP37:%.*]] = and i64 [[TMP4]], -4
 // CHECK-NEXT:    [[TMP38:%.*]] = or i64 [[TMP37]], 512
-// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP7]], ptr [[TMP8]], ptr [[TMP9]], i64 [[TMP36]], i64 [[TMP38]], ptr [[TMP13]])
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP36]], i64 [[TMP38]], ptr [[TMP5]])
 // CHECK-NEXT:    br label [[OMP_DONE]]
 // CHECK:       omp.done:
 // CHECK-NEXT:    ret void
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 1f0b129f867ae6..cc23d038d80443 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2858,6 +2858,67 @@ class OpenMPIRBuilder {
   using GenMapInfoCallbackTy =
       function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;
 
+private:
+  /// Emit the array initialization or deletion portion for user-defined mapper
+  /// code generation. First, it evaluates whether an array section is mapped
+  /// and whether the \a MapType instructs to delete this section. If \a IsInit
+  /// is true, and \a MapType indicates to not delete this array, array
+  /// initialization code is generated. If \a IsInit is false, and \a MapType
+  /// indicates to delete this array, array deletion code is generated.
+  void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
+                                  llvm::Value *Base, llvm::Value *Begin,
+                                  llvm::Value *Size, llvm::Value *MapType,
+                                  llvm::Value *MapName, TypeSize ElementSize,
+                                  llvm::BasicBlock *ExitBB, bool IsInit);
+
+public:
+  /// Emit the user-defined mapper function. The code generation follows the
+  /// pattern in the example below.
+  /// \code
+  /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
+  ///                                           void *base, void *begin,
+  ///                                           int64_t size, int64_t type,
+  ///                                           void *name = nullptr) {
+  ///   // Allocate space for an array section first or add a base/begin for
+  ///   // pointer dereference.
+  ///   if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
+  ///       !maptype.IsDelete)
+  ///     __tgt_push_mapper_component(rt_mapper_handle, base, begin,
+  ///                                 size*sizeof(Ty), clearToFromMember(type));
+  ///   // Map members.
+  ///   for (unsigned i = 0; i < size; i++) {
+  ///     // For each component specified by this mapper:
+  ///     for (auto c : begin[i]->all_components) {
+  ///       if (c.hasMapper())
+  ///         (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
+  ///                       c.arg_size, c.arg_type, c.arg_name);
+  ///       else
+  ///         __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
+  ///                                     c.arg_begin, c.arg_size, c.arg_type,
+  ///                                     c.arg_name);
+  ///     }
+  ///   }
+  ///   // Delete the array section.
+  ///   if (size > 1 && maptype.IsDelete)
+  ///     __tgt_push_mapper_component(rt_mapper_handle, base, begin,
+  ///                                 size*sizeof(Ty), clearToFromMember(type));
+  /// }
+  /// \endcode
+  ///
+  /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the
+  /// MapInfos and returns.
+  /// \param ElemTy DeclareMapper element type.
+  /// \param FuncName Optional param to specify mapper function name.
+  /// \param CustomMapperCB Optional callback to generate code related to
+  /// custom mappers.
+  Function *emitUserDefinedMapper(
+      function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
+                                llvm::Value *BeginArg)>
+          PrivAndGenMapInfoCB,
+      llvm::Type *ElemTy, StringRef FuncName = {},
+      function_ref<bool(unsigned int, Function **)> CustomMapperCB = nullptr);
+
   /// Generator for '#omp target data'
   ///
   /// \param Loc The location where the target data construct was encountered.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 764e2ca8fe4f41..fab85f6926ddaa 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7722,6 +7722,297 @@ void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
   }
 }
 
+void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
+    Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
+    Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
+    BasicBlock *ExitBB, bool IsInit) {
+  StringRef Prefix = IsInit ? ".init" : ".del";
+
+  // Evaluate if this is an array section.
+  BasicBlock *BodyBB = BasicBlock::Create(
+      M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
+  Value *IsArray =
+      Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
+  Value *DeleteBit = Builder.CreateAnd(
+      MapType,
+      Builder.getInt64(
+          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+              OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
+  Value *DeleteCond;
+  Value *Cond;
+  if (IsInit) {
+    // base != begin?
+    Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
+    // IsPtrAndObj?
+    Value *PtrAndObjBit = Builder.CreateAnd(
+        MapType,
+        Builder.getInt64(
+            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+                OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
+    PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
+    BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
+    Cond = Builder.CreateOr(IsArray, BaseIsBegin);
+    DeleteCond = Builder.CreateIsNull(
+        DeleteBit,
+        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
+  } else {
+    Cond = IsArray;
+    DeleteCond = Builder.CreateIsNotNull(
+        DeleteBit,
+        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
+  }
+  Cond = Builder.CreateAnd(Cond, DeleteCond);
+  Builder.CreateCondBr(Cond, BodyBB, ExitBB);
+
+  emitBlock(BodyBB, MapperFn);
+  // Get the array size by multiplying element size and element number (i.e., \p
+  // Size).
+  Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
+  // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
+  // memory allocation/deletion purpose only.
+  Value *MapTypeArg = Builder.CreateAnd(
+      MapType,
+      Builder.getInt64(
+          ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+              OpenMPOffloadMappingFlags::OMP_MAP_TO |
+              OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
+  MapTypeArg = Builder.CreateOr(
+      MapTypeArg,
+      Builder.getInt64(
+          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+              OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
+
+  // Call the runtime API __tgt_push_mapper_component to fill up the runtime
+  // data structure.
+  Value *OffloadingArgs[] = {MapperHandle, Base,       Begin,
+                             ArraySize,    MapTypeArg, MapName};
+  Builder.CreateCall(
+      getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
+      OffloadingArgs);
+}
+
+Function *OpenMPIRBuilder::emitUserDefinedMapper(
+    function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
+                              llvm::Value *BeginArg)>
+        GenMapInfoCB,
+    Type *ElemTy, StringRef FuncName,
+    function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
+  SmallVector<Type *> Params;
+  Params.emplace_back(Builder.getPtrTy());
+  Params.emplace_back(Builder.getPtrTy());
+  Params.emplace_back(Builder.getPtrTy());
+  Params.emplace_back(Builder.getInt64Ty());
+  Params.emplace_back(Builder.getInt64Ty());
+  Params.emplace_back(Builder.getPtrTy());
+
+  auto *FnTy =
+      FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
+
+  SmallString<64> TyStr;
+  raw_svector_ostream Out(TyStr);
+  if (FuncName == "")
+    FuncName = StringRef{createPlatformSpecificName({"omp_mapper"})};
+  Function *MapperFn =
+      Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
+  MapperFn->addFnAttr(Attribute::NoInline);
+  MapperFn->addFnAttr(Attribute::NoUnwind);
+  MapperFn->addParamAttr(0, Attribute::NoUndef);
+  MapperFn->addParamAttr(1, Attribute::NoUndef);
+  MapperFn->addParamAttr(2, Attribute::NoUndef);
+  MapperFn->addParamAttr(3, Attribute::NoUndef);
+  MapperFn->addParamAttr(4, Attribute::NoUndef);
+  MapperFn->addParamAttr(5, Attribute::NoUndef);
+
+  // Start the mapper function code generation.
+  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
+  auto SavedIP = Builder.saveIP();
+  Builder.SetInsertPoint(EntryBB);
+
+  Value *MapperHandle = MapperFn->getArg(0);
+  Value *BaseIn = MapperFn->getArg(1);
+  Value *BeginIn = MapperFn->getArg(2);
+  Value *Size = MapperFn->getArg(3);
+  Value *MapType = MapperFn->getArg(4);
+  Value *MapName = MapperFn->getArg(5);
+
+  // Compute the starting and end addresses of array elements.
+  // Prepare common arguments for array initiation and deletion.
+  // Convert the size in bytes into the number of array elements.
+  TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
+  Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
+  Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
+  Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
+
+  // Emit array initiation if this is an array section and \p MapType indicates
+  // that memory allocation is required.
+  BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
+  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
+                             MapType, MapName, ElementSize, HeadBB,
+                             /*IsInit=*/true);
+
+  // Emit a for loop to iterate through SizeArg of elements and map all of them.
+
+  // Emit the loop header block.
+  emitBlock(HeadBB, MapperFn);
+  BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
+  BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
+  // Evaluate whether the initial condition is satisfied.
+  Value *IsEmpty =
+      Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
+  Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
+
+  // Emit the loop body block.
+  emitBlock(BodyBB, MapperFn);
+  BasicBlock *LastBB = BodyBB;
+  PHINode *PtrPHI =
+      Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
+  PtrPHI->addIncoming(PtrBegin, HeadBB);
+
+  // Get map clause information. Fill up the arrays with all mapped variables.
+  MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
+
+  // Call the runtime API __tgt_mapper_num_components to get the number of
+  // pre-existing components.
+  Value *OffloadingArgs[] = {MapperHandle};
+  Value *PreviousSize = Builder.CreateCall(
+      getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
+      OffloadingArgs);
+  Value *ShiftedPreviousSize =
+      Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
+
+  // Fill up the runtime mapper handle for all components.
+  for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
+    Value *CurBaseArg =
+        Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
+    Value *CurBeginArg =
+        Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy());
+    Value *CurSizeArg = Info.Sizes[I];
+    Value *CurNameArg = Info.Names.size()
+                            ? Info.Names[I]
+                            : Constant::getNullValue(Builder.getPtrTy());
+
+    // Extract the MEMBER_OF field from the map type.
+    Value *OriMapType = Builder.getInt64(
+        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+            Info.Types[I]));
+    Value *MemberMapType =
+        Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
+
+    // Combine the map type inherited from user-defined mapper with that
+    // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
+    // bits of the \a MapType, which is the input argument of the mapper
+    // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
+    // bits of MemberMapType.
+    // [OpenMP 5.0], 1.2.6. map-type decay.
+    //        | alloc |  to   | from  | tofrom | release | delete
+    // ----------------------------------------------------------
+    // alloc  | alloc | alloc | alloc | alloc  | release | delete
+    // to     | alloc | to    | alloc | to     | release | delete
+    // from   | alloc | alloc | from  | from   | release | delete
+    // tofrom | alloc | to    | from  | tofrom | release | delete
+    Value *LeftToFrom = Builder.CreateAnd(
+        MapType,
+        Builder.getInt64(
+            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+                OpenMPOffloadMappingFlags::OMP_MAP_TO |
+                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
+    BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
+    BasicBlock *AllocElseBB =
+        BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
+    BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
+    BasicBlock *ToElseBB =
+        BasicBlock::Create(M.getContext(), "omp.type.to.else");
+    BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
+    BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
+    Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
+    Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
+    // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
+    emitBlock(AllocBB, MapperFn);
+    Value *AllocMapType = Builder.CreateAnd(
+        MemberMapType,
+        Builder.getInt64(
+            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+                OpenMPOffloadMappingFlags::OMP_MAP_TO |
+                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
+    Builder.CreateBr(EndBB);
+    emitBlock(AllocElseBB, MapperFn);
+    Value *IsTo = Builder.CreateICmpEQ(
+        LeftToFrom,
+        Builder.getInt64(
+            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
+    Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
+    // In case of to, clear OMP_MAP_FROM.
+    emitBlock(ToBB, MapperFn);
+    Value *ToMapType = Builder.CreateAnd(
+        MemberMapType,
+        Builder.getInt64(
+            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
+    Builder.CreateBr(EndBB);
+    emitBlock(ToElseBB, MapperFn);
+    Value *IsFrom = Builder.CreateICmpEQ(
+        LeftToFrom,
+        Builder.getInt64(
+            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
+    Builder.CreateCondBr(IsFrom, FromBB, EndBB);
+    // In case of from, clear OMP_MAP_TO.
+    emitBlock(FromBB, MapperFn);
+    Value *FromMapType = Builder.CreateAnd(
+        MemberMapType,
+        Builder.getInt64(
+            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
+                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
+    // In case of tofrom, do nothing.
+    emitBlock(EndBB, MapperFn);
+    LastBB = EndBB;
+    PHINode *CurMapType =
+        Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
+    CurMapType->addIncoming(AllocMapType, AllocBB);
+    CurMapType->addIncoming(ToMapType, ToBB);
+    CurMapType->addIncoming(FromMapType, FromBB);
+    CurMapType->addIncoming(MemberMapType, ToElseBB);
+
+    Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
+                               CurSizeArg,   CurMapType, CurNameArg};
+    Function *ChildMapperFn = nullptr;
+    if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
+      // Call the corresponding mapper function.
+      Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
+    } else {
+      // Call the runtime API __tgt_push_mapper_component to fill up the runtime
+      // data structure.
+      Builder.CreateCall(
+          getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
+          OffloadingArgs);
+    }
+  }
+
+  // Update the pointer to point to the next element that needs to be mapped,
+  // and check whether we have mapped all elements.
+  Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
+                                              "omp.arraymap.next");
+  PtrPHI->addIncoming(PtrNext, LastBB);
+  Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
+  BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
+  Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
+
+  emitBlock(ExitBB, MapperFn);
+  // Emit array deletion if this is an array section and \p MapType indicates
+  // that deletion is required.
+  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
+                             MapType, MapName, ElementSize, DoneBB,
+                             /*IsInit=*/false);
+
+  // Emit the function exit block.
+  emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(SavedIP);
+  return MapperFn;
+}
+
 void OpenMPIRBuilder::emitOffloadingArrays(
     InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
     TargetDataInfo &Info, bool IsNonContiguous,

From fc97d2e68b03bc2979395e84b645e5b3ba35aecd Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Wed, 18 Dec 2024 07:02:37 -0800
Subject: [PATCH 35/37] [flang] Add UNSIGNED (#113504)

Implement the UNSIGNED extension type and operations under control of a
language feature flag (-funsigned).

This is nearly identical to the UNSIGNED feature that has been available
in Sun Fortran for years, and now implemented in GNU Fortran for
gfortran 15, and proposed for ISO standardization in J3/24-116.txt.

See the new documentation for details; but in short, this is C's
unsigned type, with guaranteed modular arithmetic for +, -, and *, and
the related transformational intrinsic functions SUM & al.
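As a quick orientation for readers unfamiliar with the Sun/GNU extension,
here is a small illustrative sketch (not taken from this patch's tests) of
the kind of program -funsigned is meant to accept, assuming the default
UNSIGNED kind is the 32-bit one matching the default INTEGER kind:

```fortran
! Illustrative sketch only; requires -funsigned.
! Assumes a 32-bit default UNSIGNED kind, so HUGE(u) is 4294967295.
program unsigned_demo
  unsigned :: u, v
  u = 4294967295u      ! unsigned constant: decimal digits followed by 'u'
  v = 1u
  print *, u + v       ! modular arithmetic: wraps to 0 instead of overflowing
  print *, v - 2u      ! also wraps: prints 4294967295
  print *, huge(v)     ! 4294967295
  print *, min(u, 7u)  ! homogeneous MIN is supported: prints 7
end program unsigned_demo
```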
--- clang/include/clang/Driver/Options.td | 1 + clang/lib/Driver/ToolChains/Flang.cpp | 3 +- flang/docs/Extensions.md | 1 + flang/docs/Unsigned.md | 121 +++ flang/docs/index.md | 1 + flang/include/flang/Common/Fortran-consts.h | 6 +- flang/include/flang/Common/Fortran-features.h | 4 +- flang/include/flang/Common/Fortran.h | 3 +- flang/include/flang/Evaluate/complex.h | 5 +- flang/include/flang/Evaluate/expression.h | 47 +- flang/include/flang/Evaluate/fold.h | 11 + flang/include/flang/Evaluate/integer.h | 11 +- flang/include/flang/Evaluate/real.h | 3 +- flang/include/flang/Evaluate/tools.h | 10 +- flang/include/flang/Evaluate/type.h | 40 +- flang/include/flang/ISO_Fortran_binding.h | 7 +- .../flang/Optimizer/Builder/FIRBuilder.h | 25 + .../Optimizer/Builder/Runtime/RTBuilder.h | 89 +++ .../Dialect/CanonicalizationPatterns.td | 4 +- .../include/flang/Optimizer/Dialect/FIROps.td | 5 +- .../flang/Optimizer/Dialect/FIRTypes.td | 19 +- flang/include/flang/Optimizer/Support/Utils.h | 31 +- flang/include/flang/Parser/dump-parse-tree.h | 4 +- flang/include/flang/Parser/parse-tree.h | 19 +- flang/include/flang/Runtime/cpp-type.h | 4 + .../flang/Runtime/matmul-instances.inc | 72 ++ flang/include/flang/Runtime/numeric.h | 2 +- flang/include/flang/Runtime/reduce.h | 83 ++ flang/include/flang/Runtime/reduction.h | 93 +++ flang/include/flang/Semantics/expression.h | 5 +- flang/lib/Common/Fortran-features.cpp | 1 + flang/lib/Common/default-kinds.cpp | 1 + flang/lib/Evaluate/expression.cpp | 6 + flang/lib/Evaluate/fold-implementation.h | 58 +- flang/lib/Evaluate/fold-integer.cpp | 754 ++++++++++-------- flang/lib/Evaluate/fold-logical.cpp | 24 +- flang/lib/Evaluate/fold-matmul.h | 4 +- flang/lib/Evaluate/fold-reduction.h | 19 +- flang/lib/Evaluate/formatting.cpp | 9 +- flang/lib/Evaluate/intrinsics.cpp | 196 +++-- flang/lib/Evaluate/target.cpp | 2 + flang/lib/Evaluate/tools.cpp | 115 ++- flang/lib/Evaluate/type.cpp | 10 + flang/lib/Frontend/CompilerInvocation.cpp | 6 + flang/lib/Lower/Bridge.cpp | 9 +- flang/lib/Lower/ConvertConstant.cpp | 16 +- flang/lib/Lower/ConvertExpr.cpp | 127 ++- flang/lib/Lower/ConvertExprToHLFIR.cpp | 84 +- flang/lib/Lower/ConvertType.cpp | 8 +- flang/lib/Lower/IO.cpp | 35 +- flang/lib/Lower/Mangler.cpp | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 395 ++++++--- .../Optimizer/Builder/Runtime/Reduction.cpp | 191 +++++ .../Builder/Runtime/Transformational.cpp | 7 +- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 12 +- flang/lib/Optimizer/Dialect/FIRType.cpp | 53 +- flang/lib/Parser/Fortran-parsers.cpp | 21 +- flang/lib/Parser/type-parsers.h | 1 + flang/lib/Semantics/check-arithmeticif.cpp | 3 + flang/lib/Semantics/check-case.cpp | 8 +- flang/lib/Semantics/expression.cpp | 120 ++- flang/lib/Semantics/resolve-names.cpp | 11 + flang/lib/Semantics/scope.cpp | 1 + flang/lib/Semantics/tools.cpp | 8 +- flang/module/iso_c_binding.f90 | 29 + flang/module/iso_fortran_env.f90 | 7 + flang/module/iso_fortran_env_impl.f90 | 30 + flang/runtime/Float128Math/random.cpp | 2 +- flang/runtime/descriptor-io.h | 38 +- flang/runtime/dot-product.cpp | 23 + flang/runtime/edit-input.cpp | 26 +- flang/runtime/edit-input.h | 2 +- flang/runtime/edit-output.cpp | 14 +- flang/runtime/edit-output.h | 14 +- flang/runtime/extrema.cpp | 132 +++ flang/runtime/findloc.cpp | 26 +- flang/runtime/io-api-minimal.cpp | 2 +- flang/runtime/matmul.cpp | 9 +- flang/runtime/numeric.cpp | 4 +- flang/runtime/product.cpp | 43 + flang/runtime/random-templates.h | 25 +- flang/runtime/random.cpp | 61 +- flang/runtime/reduce.cpp | 
214 +++++ flang/runtime/reduction-templates.h | 8 +- flang/runtime/reduction.cpp | 58 +- flang/runtime/sum.cpp | 33 + flang/runtime/tools.h | 20 +- flang/runtime/type-code.cpp | 29 + flang/runtime/type-info.cpp | 1 + flang/test/Evaluate/fold-unsigned.f90 | 120 +++ flang/test/Lower/Intrinsics/shifta.f90 | 10 +- flang/test/Lower/allocatable-polymorphic.f90 | 2 +- flang/test/Lower/unsigned-ops.f90 | 26 + flang/test/Semantics/complex01.f90 | 4 +- flang/test/Semantics/typeinfo01.f90 | 8 +- flang/test/Semantics/typeinfo08.f90 | 2 +- flang/test/Semantics/unsigned-errors.f90 | 77 ++ flang/unittests/Evaluate/real.cpp | 4 +- 98 files changed, 3348 insertions(+), 801 deletions(-) create mode 100644 flang/docs/Unsigned.md create mode 100644 flang/test/Evaluate/fold-unsigned.f90 create mode 100644 flang/test/Lower/unsigned-ops.f90 create mode 100644 flang/test/Semantics/unsigned-errors.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7b544d2534d469..14e47f083ecec4 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6913,6 +6913,7 @@ defm underscoring : OptInFC1FFlag<"underscoring", "Appends one trailing undersco defm ppc_native_vec_elem_order: BoolOptionWithoutMarshalling<"f", "ppc-native-vector-element-order", PosFlag, NegFlag>; +defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">; def fno_automatic : Flag<["-"], "fno-automatic">, Group, HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">; diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 8cffa20c4a2d36..7034e5b475c1d3 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -122,7 +122,8 @@ void Flang::addOtherOptions(const ArgList &Args, ArgStringList &CmdArgs) const { options::OPT_fintrinsic_modules_path, options::OPT_pedantic, options::OPT_std_EQ, options::OPT_W_Joined, options::OPT_fconvert_EQ, options::OPT_fpass_plugin_EQ, - options::OPT_funderscoring, options::OPT_fno_underscoring}); + options::OPT_funderscoring, options::OPT_fno_underscoring, + options::OPT_funsigned, options::OPT_fno_unsigned}); llvm::codegenoptions::DebugInfoKind DebugInfoKind; if (Args.hasArg(options::OPT_gN_Group)) { diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 4b4b516d0fb691..626bf4399d6325 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -420,6 +420,7 @@ end [-fimplicit-none-type-never] * Old-style `PARAMETER pi=3.14` statement without parentheses [-falternative-parameter-statement] +* `UNSIGNED` type (-funsigned) ### Extensions and legacy features deliberately not supported diff --git a/flang/docs/Unsigned.md b/flang/docs/Unsigned.md new file mode 100644 index 00000000000000..5c90e2aa185bca --- /dev/null +++ b/flang/docs/Unsigned.md @@ -0,0 +1,121 @@ + + +# Fortran Extensions supported by Flang + +```{contents} +--- +local: +--- +``` + +For better compatibility with GNU Fortran and Sun Fortran, +this compiler supports an option (`-funsigned`) that enables +the `UNSIGNED` data type, constants, intrinsic functions, +its use with intrinsic operations and `SELECT CASE`, and C +language interoperability. + +## `UNSIGNED` type + +`UNSIGNED` is a numeric type with the same kinds as `INTEGER`. 
+It may appear as a type-spec in any context, including +a type declaration statement, a type-decl in an array +constructor or `ALLOCATE` statement, `IMPLICIT`, or a +function statement's prefix. + +`UNSIGNED` constants are nonempty strings of decimal digits +followed by the letter `U` and optionally a kind suffix with +an underscore. + +## `UNSIGNED` operations + +`UNSIGNED` operands are accepted for unary negation (`-`), +the basic four binary arithmetic intrinsic operations `+`, `-`, `*`, and `/`, +components in complex constructors, +and for numeric relational operators. +The power operator `**` does not accept `UNSIGNED` operands. + +Mixed operations with other types are not allowed. +Mixed operations with one `UNSIGNED` operand and one BOZ literal +constant operand are allowed. +When the operands' kinds differ, the smaller operand is zero-extended +to the size of the larger. + +The arithmetic operations `u+v`, `-u`, `u-v`, and `u*v` are implemented +modulo `MAX(HUGE(u),HUGE(v))+1`; +informally speaking, they always truncate their results, or are +guaranteed to "wrap". + +## `UNSIGNED` intrinsic functions + +`UNSIGNED` operands are accepted as operands to, +or may be returned as results from, +several intrinsic procedures. + +Bitwise operations: +* `NOT` +* `IAND`, `IOR`, `IEOR`, `IBCLR`, `IBSET`, `IBITS`, `MERGE_BITS` +* `BTEST` +* `ISHFT`, `ISHFTC` +* `SHIFTA`, `SHIFTL`, `SHIFTR` +* `TRANSFER` +* `MVBITS` + +The existing unsigned comparisons `BLT`, `BLE`, `BGE`, and `BGT`. + +The inquiries `BIT_SIZE`, `DIGITS`, `HUGE`, and `RANGE`. + +Homogeneous `MAX` and `MIN`. + +`RANDOM_NUMBER`. + +The intrinsic array functions: +* `MAXVAL`, `MINVAL` +* `SUM`, `PRODUCT` +* `IALL`, `IANY`, `IPARITY` +* `DOT_PRODUCT`, `MATMUL` + +All of the restructuring array transformational intrinsics: `CSHIFT`, `EOSHIFT`, + `PACK`, `RESHAPE`, `SPREAD`, `TRANSPOSE`, and `UNPACK`. + +The location transformationals `FINDLOC`, `MAXLOC`, and `MINLOC`. + +There is a new `SELECTED_UNSIGNED_KIND` intrinsic function; it happens +to work identically to the existing `SELECTED_INT_KIND`. + +Two new intrinsic functions `UMASKL` and `UMASKR` work just like +`MASKL` and `MASKR`, returning unsigned results instead of integers. + +Conversions to `UNSIGNED`, or between `UNSIGNED` kinds, can be done +via the new `UINT` intrinsic. The `UNSIGNED` intrinsic name is also +supported as an alias. + +Support for `UNSIGNED` in the `OUT_OF_RANGE` predicate remains to be implemented. + +## Other usage + +`UNSIGNED` is allowed in `SELECT CASE`, but not in `DO` loop indices or +limits, or an arithmetic `IF` expression. + +`UNSIGNED` array indices are not allowed. + +`UNSIGNED` data may be used as data items in I/O statements, including +list-directed and `NAMELIST` I/O. +Format-directed I/O may edit `UNSIGNED` data with `I`, `G`, `B`, `O`, and `Z` +edit descriptors. + +## C interoperability + +`UNSIGNED` data map to type codes for C's `unsigned` types in the +`type` member of a `cdesc_t` descriptor in the `ISO_Fortran_binding.h` +header file. + +## Standard modules + +New definitions (`C_UNSIGNED`, `C_UINT8_T`, &c.) were added to ISO_C_BINDING +and new constants (`UINT8`, `UINT16`, &c.) to ISO_FORTRAN_ENV. diff --git a/flang/docs/index.md b/flang/docs/index.md index 70478fa0936d0b..c35f634746e68b 100644 --- a/flang/docs/index.md +++ b/flang/docs/index.md @@ -87,6 +87,7 @@ on how to get in touch with us and to learn more about the current status. 
f2018-grammar.md
    fstack-arrays
    Real16MathSupport
+   Unsigned
 ```
 
 # Indices and tables
diff --git a/flang/include/flang/Common/Fortran-consts.h b/flang/include/flang/Common/Fortran-consts.h
index cf7884e7454c0c..3ce5b6ac7b6865 100644
--- a/flang/include/flang/Common/Fortran-consts.h
+++ b/flang/include/flang/Common/Fortran-consts.h
@@ -14,8 +14,10 @@
 namespace Fortran::common {
 
-// Fortran has five kinds of intrinsic data types, plus the derived types.
-ENUM_CLASS(TypeCategory, Integer, Real, Complex, Character, Logical, Derived)
+// Fortran has five kinds of standard intrinsic data types, the Unsigned
+// extension, and derived types.
+ENUM_CLASS(
+    TypeCategory, Integer, Unsigned, Real, Complex, Character, Logical, Derived)
 
 ENUM_CLASS(VectorElementCategory, Integer, Unsigned, Real)
 
 ENUM_CLASS(IoStmtKind, None, Backspace, Close, Endfile, Flush, Inquire, Open,
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h
index b04f6117ae9656..44f88009f8f2c2 100644
--- a/flang/include/flang/Common/Fortran-features.h
+++ b/flang/include/flang/Common/Fortran-features.h
@@ -54,7 +54,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
     PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy,
     UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr,
     SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
-    IgnoreIrrelevantAttributes)
+    IgnoreIrrelevantAttributes, Unsigned)
 
 // Portability and suspicious usage warnings
 ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
@@ -73,7 +73,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
     PreviousScalarUse, RedeclaredInaccessibleComponent, ImplicitShared,
     IndexVarRedefinition, IncompatibleImplicitInterfaces, BadTypeForTarget,
     VectorSubscriptFinalization, UndefinedFunctionResult, UselessIomsg,
-    MismatchingDummyProcedure, SubscriptedEmptyArray)
+    MismatchingDummyProcedure, SubscriptedEmptyArray, UnsignedLiteralTruncation)
 
 using LanguageFeatures = EnumSet<LanguageFeature, LanguageFeature_enumSize>;
 using UsageWarnings = EnumSet<UsageWarning, UsageWarning_enumSize>;
diff --git a/flang/include/flang/Common/Fortran.h b/flang/include/flang/Common/Fortran.h
index 72e4348a42a3f6..e1922f7654bb1a 100644
--- a/flang/include/flang/Common/Fortran.h
+++ b/flang/include/flang/Common/Fortran.h
@@ -23,7 +23,8 @@ namespace Fortran::common {
 class LanguageFeatureControl;
 
 constexpr bool IsNumericTypeCategory(TypeCategory category) {
-  return category == TypeCategory::Integer || category == TypeCategory::Real ||
+  return category == TypeCategory::Integer ||
+      category == TypeCategory::Unsigned || category == TypeCategory::Real ||
       category == TypeCategory::Complex;
 }
diff --git a/flang/include/flang/Evaluate/complex.h b/flang/include/flang/Evaluate/complex.h
index 06eef842410944..2dcd28b59968cd 100644
--- a/flang/include/flang/Evaluate/complex.h
+++ b/flang/include/flang/Evaluate/complex.h
@@ -61,10 +61,11 @@ template <typename REAL_TYPE> class Complex {
   template <typename INT>
   static ValueWithRealFlags<Complex> FromInteger(const INT &n,
+      bool isUnsigned = false,
       Rounding rounding = TargetCharacteristics::defaultRounding) {
     ValueWithRealFlags<Complex> result;
-    result.value.re_ =
-        Part::FromInteger(n, rounding).AccumulateFlags(result.flags);
+    result.value.re_ = Part::FromInteger(n, isUnsigned, rounding)
+                           .AccumulateFlags(result.flags);
     return result;
   }
 
diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h
index 2a40193e32306b..9ea037a2f7c429 100644
--- a/flang/include/flang/Evaluate/expression.h
+++ 
b/flang/include/flang/Evaluate/expression.h
@@ -209,10 +209,12 @@ template <typename TO, TypeCategory FROMCAT> struct Convert : public Operation<Convert<TO, FROMCAT>, TO, SomeKind<FROMCAT>> {
   // Fortran doesn't have conversions between kinds of CHARACTER apart from
   // assignments, and in those the data must be convertible to/from 7-bit ASCII.
-  static_assert(((TO::category == TypeCategory::Integer ||
-                     TO::category == TypeCategory::Real) &&
-                    (FROMCAT == TypeCategory::Integer ||
-                        FROMCAT == TypeCategory::Real)) ||
+  static_assert(
+      ((TO::category == TypeCategory::Integer ||
+           TO::category == TypeCategory::Real ||
+           TO::category == TypeCategory::Unsigned) &&
+          (FROMCAT == TypeCategory::Integer || FROMCAT == TypeCategory::Real ||
+              FROMCAT == TypeCategory::Unsigned)) ||
       TO::category == FROMCAT);
   using Result = TO;
   using Operand = SomeKind<FROMCAT>;
@@ -526,7 +528,8 @@ class Expr<Type<TypeCategory::Integer, KIND>>
 private:
   using Conversions = std::tuple<Convert<Result, TypeCategory::Integer>,
-      Convert<Result, TypeCategory::Real>>;
+      Convert<Result, TypeCategory::Real>,
+      Convert<Result, TypeCategory::Unsigned>>;
   using Operations = std::tuple<Parentheses<Result>, Negate<Result>,
       Add<Result>, Subtract<Result>, Multiply<Result>, Divide<Result>,
       Power<Result>, Extremum<Result>>;
@@ -547,6 +550,29 @@ class Expr<Type<TypeCategory::Integer, KIND>>
       u;
 };
 
+template <int KIND>
+class Expr<Type<TypeCategory::Unsigned, KIND>>
+    : public ExpressionBase<Type<TypeCategory::Unsigned, KIND>> {
+public:
+  using Result = Type<TypeCategory::Unsigned, KIND>;
+
+  EVALUATE_UNION_CLASS_BOILERPLATE(Expr)
+
+private:
+  using Conversions = std::tuple<Convert<Result, TypeCategory::Integer>,
+      Convert<Result, TypeCategory::Real>,
+      Convert<Result, TypeCategory::Unsigned>>;
+  using Operations =
+      std::tuple<Parentheses<Result>, Negate<Result>, Add<Result>,
+          Subtract<Result>, Multiply<Result>, Divide<Result>, Extremum<Result>>;
+  using Others = std::tuple<Constant<Result>, ArrayConstructor<Result>,
+      Designator<Result>, FunctionRef<Result>>;
+
+public:
+  common::TupleToVariant<common::CombineTuples<Operations, Conversions, Others>>
+      u;
+};
+
 template <int KIND>
 class Expr<Type<TypeCategory::Real, KIND>>
     : public ExpressionBase<Type<TypeCategory::Real, KIND>> {
@@ -560,7 +586,8 @@ class Expr<Type<TypeCategory::Real, KIND>>
   // N.B. Real->Complex and Complex->Real conversions are done with CMPLX
   // and part access operations (resp.).
   using Conversions = std::variant<Convert<Result, TypeCategory::Integer>,
-      Convert<Result, TypeCategory::Real>>;
+      Convert<Result, TypeCategory::Real>,
+      Convert<Result, TypeCategory::Unsigned>>;
   using Operations = std::variant<ComplexComponent<KIND>, Parentheses<Result>,
       Negate<Result>, Add<Result>, Subtract<Result>, Multiply<Result>,
       Divide<Result>, Power<Result>, RealToIntPower<Result>, Extremum<Result>>;
@@ -590,6 +617,7 @@ class Expr<Type<TypeCategory::Complex, KIND>>
 };
 
 FOR_EACH_INTEGER_KIND(extern template class Expr, )
+FOR_EACH_UNSIGNED_KIND(extern template class Expr, )
 FOR_EACH_REAL_KIND(extern template class Expr, )
 FOR_EACH_COMPLEX_KIND(extern template class Expr, )
 
@@ -629,7 +657,8 @@ class Relational : public Operation<Relational<T>, LogicalResult, T, T> {
   static_assert(Operand::category == TypeCategory::Integer ||
       Operand::category == TypeCategory::Real ||
       Operand::category == TypeCategory::Complex ||
-      Operand::category == TypeCategory::Character);
+      Operand::category == TypeCategory::Character ||
+      Operand::category == TypeCategory::Unsigned);
   CLASS_BOILERPLATE(Relational)
   Relational(
       RelationalOperator r, const Expr<T> &a, const Expr<T> &b)
@@ -642,7 +671,7 @@ class Relational : public Operation<Relational<T>, LogicalResult, T, T> {
 
 template <> class Relational<SomeType> {
   using DirectlyComparableTypes = common::CombineTuples<IntegerTypes, RealTypes,
-      ComplexTypes, CharacterTypes>;
+      ComplexTypes, CharacterTypes, UnsignedTypes>;
 
 public:
   using Result = LogicalResult;
@@ -656,6 +685,7 @@ template <> class Relational<SomeType> {
 };
 
 FOR_EACH_INTEGER_KIND(extern template class Relational, )
+FOR_EACH_UNSIGNED_KIND(extern template class Relational, )
 FOR_EACH_REAL_KIND(extern template class Relational, )
 FOR_EACH_CHARACTER_KIND(extern template class Relational, )
 extern template class Relational<SomeType>;
@@ -886,6 +916,7 @@ FOR_EACH_INTRINSIC_KIND(extern template class ArrayConstructor, )
   FOR_EACH_INTRINSIC_KIND(template class Expr, ) \
   FOR_EACH_CATEGORY_TYPE(template class Expr, ) \
   FOR_EACH_INTEGER_KIND(template class Relational, ) \
+  FOR_EACH_UNSIGNED_KIND(template class Relational, ) \
   FOR_EACH_REAL_KIND(template class Relational, ) \
   FOR_EACH_CHARACTER_KIND(template class Relational, ) \
   template class Relational<SomeType>; \
diff --git 
a/flang/include/flang/Evaluate/fold.h b/flang/include/flang/Evaluate/fold.h
index d2a153fb7919e4..b21c0f311fd35b 100644
--- a/flang/include/flang/Evaluate/fold.h
+++ b/flang/include/flang/Evaluate/fold.h
@@ -89,8 +89,19 @@ constexpr std::optional<std::int64_t> ToInt64(
     return std::nullopt;
   }
 }
+template <int KIND>
+constexpr std::optional<std::int64_t> ToInt64(
+    const Expr<Type<TypeCategory::Unsigned, KIND>> &expr) {
+  if (auto scalar{
+          GetScalarConstantValue<Type<TypeCategory::Unsigned, KIND>>(expr)}) {
+    return scalar->ToInt64();
+  } else {
+    return std::nullopt;
+  }
+}
 
 std::optional<std::int64_t> ToInt64(const Expr<SomeInteger> &);
+std::optional<std::int64_t> ToInt64(const Expr<SomeUnsigned> &);
 std::optional<std::int64_t> ToInt64(const Expr<SomeType> &);
 std::optional<std::int64_t> ToInt64(const ActualArgument &);
 
diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h
index e420eb75e3dff0..fccc2ad774a8fc 100644
--- a/flang/include/flang/Evaluate/integer.h
+++ b/flang/include/flang/Evaluate/integer.h
@@ -33,6 +33,12 @@
 
 namespace Fortran::evaluate::value {
 
+// Computes decimal range in the sense of SELECTED_INT_KIND
+static constexpr int DecimalRange(int bits) {
+  // This magic value is LOG10(2.)*1E12.
+  return static_cast<int>((bits * 301029995664) / 1000000000000);
+}
+
 // Implements an integer as an assembly of smaller host integer parts
 // that constitute the digits of a large-radix fixed-point number.
 // For best performance, the type of these parts should be half of the
@@ -367,9 +373,8 @@ class Integer {
   static constexpr int DIGITS{bits - 1}; // don't count the sign bit
   static constexpr Integer HUGE() { return MASKR(bits - 1); }
   static constexpr Integer Least() { return MASKL(1); }
-  static constexpr int RANGE{// in the sense of SELECTED_INT_KIND
-      // This magic value is LOG10(2.)*1E12.
-      static_cast<int>(((bits - 1) * 301029995664) / 1000000000000)};
+  static constexpr int RANGE{DecimalRange(bits - 1)};
+  static constexpr int UnsignedRANGE{DecimalRange(bits)};
 
   constexpr bool IsZero() const {
     for (int j{0}; j < parts; ++j) {
diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h
index 11cc8f776b0e95..03294881850a13 100644
--- a/flang/include/flang/Evaluate/real.h
+++ b/flang/include/flang/Evaluate/real.h
@@ -288,8 +288,9 @@ template <typename WORD, int PREC> class Real {
   template <typename INT>
   static ValueWithRealFlags<Real> FromInteger(const INT &n,
+      bool isUnsigned = false,
       Rounding rounding = TargetCharacteristics::defaultRounding) {
-    bool isNegative{n.IsNegative()};
+    bool isNegative{!isUnsigned && n.IsNegative()};
     INT absN{n};
     if (isNegative) {
       absN = n.Negate().value; // overflow is safe to ignore
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index dafacdf1ba0c5a..f586c59d46e54c 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -582,7 +582,8 @@ Expr<TO> ConvertToType(Expr<SomeKind<FROMCAT>> &&x) {
 template <typename TO> Expr<TO> ConvertToType(BOZLiteralConstant &&x) {
   static_assert(IsSpecificIntrinsicType<TO>);
-  if constexpr (TO::category == TypeCategory::Integer) {
+  if constexpr (TO::category == TypeCategory::Integer ||
+      TO::category == TypeCategory::Unsigned) {
     return Expr<TO>{
         Constant<TO>{Scalar<TO>::ConvertUnsigned(std::move(x)).value}};
   } else {
@@ -754,11 +755,11 @@ Expr<SomeKind<CAT>> PromoteAndCombine(
 // one of the operands to the type of the other. Handles special cases with
 // typeless literal operands and with REAL/COMPLEX exponentiation to INTEGER
 // powers.
-template