-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[flang][acc] Add support for lowering combined constructs #80321
Conversation
@llvm/pr-subscribers-flang-fir-hlfir @llvm/pr-subscribers-openacc Author: Razvan Lupusoru (razvanlupusoru) ChangesPR#80319 added support to record combined construct semantics via an attribute. Add lowering support for this. Patch is 78.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80321.diff 6 Files Affected:
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 43f54c6d2a71b..a337a0f60113d 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1662,6 +1662,7 @@ createLoopOp(Fortran::lower::AbstractConverter &converter,
const Fortran::parser::DoConstruct &outerDoConstruct,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::AccClauseList &accClauseList,
+ std::optional<mlir::acc::CombinedConstructsType> combinedConstructs,
bool needEarlyReturnHandling = false) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate,
@@ -2002,6 +2003,11 @@ createLoopOp(Fortran::lower::AbstractConverter &converter,
if (!collapseDeviceTypes.empty())
loopOp.setCollapseDeviceTypeAttr(builder.getArrayAttr(collapseDeviceTypes));
+ if (combinedConstructs) {
+ loopOp.setCombinedAttr(mlir::acc::CombinedConstructsTypeAttr::get(
+ builder.getContext(), combinedConstructs.value()));
+ }
+
return loopOp;
}
@@ -2047,7 +2053,7 @@ genACC(Fortran::lower::AbstractConverter &converter,
std::get<std::optional<Fortran::parser::DoConstruct>>(loopConstruct.t);
auto loopOp = createLoopOp(converter, currentLocation, semanticsContext,
stmtCtx, *outerDoConstruct, eval, accClauseList,
- needEarlyExitHandling);
+ {}, needEarlyExitHandling);
if (needEarlyExitHandling)
return loopOp.getResult(0);
@@ -2079,14 +2085,13 @@ static void genDataOperandOperationsWithModifier(
}
template <typename Op>
-static Op
-createComputeOp(Fortran::lower::AbstractConverter &converter,
- mlir::Location currentLocation,
- Fortran::lower::pft::Evaluation &eval,
- Fortran::semantics::SemanticsContext &semanticsContext,
- Fortran::lower::StatementContext &stmtCtx,
- const Fortran::parser::AccClauseList &accClauseList,
- bool outerCombined = false) {
+static Op createComputeOp(
+ Fortran::lower::AbstractConverter &converter,
+ mlir::Location currentLocation, Fortran::lower::pft::Evaluation &eval,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::StatementContext &stmtCtx,
+ const Fortran::parser::AccClauseList &accClauseList,
+ std::optional<mlir::acc::CombinedConstructsType> combinedConstructs) {
// Parallel operation operands
mlir::Value ifCond;
@@ -2279,7 +2284,7 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
} else if (const auto *privateClause =
std::get_if<Fortran::parser::AccClause::Private>(
&clause.u)) {
- if (!outerCombined)
+ if (!combinedConstructs)
genPrivatizations<mlir::acc::PrivateRecipeOp>(
privateClause->v, converter, semanticsContext, stmtCtx,
privateOperands, privatizations);
@@ -2297,7 +2302,7 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
// combined - delay it to the loop. However, a reduction clause on a
// combined construct implies a copy clause so issue an implicit copy
// instead.
- if (!outerCombined) {
+ if (!combinedConstructs) {
genReductions(reductionClause->v, converter, semanticsContext, stmtCtx,
reductionOperands, reductionRecipes);
} else {
@@ -2349,11 +2354,11 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
if constexpr (std::is_same_v<Op, mlir::acc::KernelsOp>)
computeOp = createRegionOp<Op, mlir::acc::TerminatorOp>(
builder, currentLocation, currentLocation, eval, operands,
- operandSegments, outerCombined);
+ operandSegments, combinedConstructs.has_value());
else
computeOp = createRegionOp<Op, mlir::acc::YieldOp>(
builder, currentLocation, currentLocation, eval, operands,
- operandSegments, outerCombined);
+ operandSegments, combinedConstructs.has_value());
if (addSelfAttr)
computeOp.setSelfAttrAttr(builder.getUnitAttr());
@@ -2406,6 +2411,11 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
mlir::ArrayAttr::get(builder.getContext(), firstPrivatizations));
}
+ if (combinedConstructs) {
+ computeOp.setCombinedAttr(mlir::acc::CombinedConstructsTypeAttr::get(
+ builder.getContext(), combinedConstructs.value()));
+ }
+
auto insPt = builder.saveInsertionPoint();
builder.setInsertionPointAfter(computeOp);
@@ -2681,18 +2691,18 @@ genACC(Fortran::lower::AbstractConverter &converter,
if (blockDirective.v == llvm::acc::ACCD_parallel) {
createComputeOp<mlir::acc::ParallelOp>(converter, currentLocation, eval,
semanticsContext, stmtCtx,
- accClauseList);
+ accClauseList, {});
} else if (blockDirective.v == llvm::acc::ACCD_data) {
genACCDataOp(converter, currentLocation, eval, semanticsContext, stmtCtx,
accClauseList);
} else if (blockDirective.v == llvm::acc::ACCD_serial) {
createComputeOp<mlir::acc::SerialOp>(converter, currentLocation, eval,
semanticsContext, stmtCtx,
- accClauseList);
+ accClauseList, {});
} else if (blockDirective.v == llvm::acc::ACCD_kernels) {
createComputeOp<mlir::acc::KernelsOp>(converter, currentLocation, eval,
semanticsContext, stmtCtx,
- accClauseList);
+ accClauseList, {});
} else if (blockDirective.v == llvm::acc::ACCD_host_data) {
genACCHostDataOp(converter, currentLocation, eval, semanticsContext,
stmtCtx, accClauseList);
@@ -2721,21 +2731,24 @@ genACC(Fortran::lower::AbstractConverter &converter,
if (combinedDirective.v == llvm::acc::ACCD_kernels_loop) {
createComputeOp<mlir::acc::KernelsOp>(
converter, currentLocation, eval, semanticsContext, stmtCtx,
- accClauseList, /*outerCombined=*/true);
+ accClauseList, mlir::acc::CombinedConstructsType::KernelsLoop);
createLoopOp(converter, currentLocation, semanticsContext, stmtCtx,
- *outerDoConstruct, eval, accClauseList);
+ *outerDoConstruct, eval, accClauseList,
+ mlir::acc::CombinedConstructsType::KernelsLoop);
} else if (combinedDirective.v == llvm::acc::ACCD_parallel_loop) {
createComputeOp<mlir::acc::ParallelOp>(
converter, currentLocation, eval, semanticsContext, stmtCtx,
- accClauseList, /*outerCombined=*/true);
+ accClauseList, mlir::acc::CombinedConstructsType::ParallelLoop);
createLoopOp(converter, currentLocation, semanticsContext, stmtCtx,
- *outerDoConstruct, eval, accClauseList);
+ *outerDoConstruct, eval, accClauseList,
+ mlir::acc::CombinedConstructsType::ParallelLoop);
} else if (combinedDirective.v == llvm::acc::ACCD_serial_loop) {
- createComputeOp<mlir::acc::SerialOp>(converter, currentLocation, eval,
- semanticsContext, stmtCtx,
- accClauseList, /*outerCombined=*/true);
+ createComputeOp<mlir::acc::SerialOp>(
+ converter, currentLocation, eval, semanticsContext, stmtCtx,
+ accClauseList, mlir::acc::CombinedConstructsType::SerialLoop);
createLoopOp(converter, currentLocation, semanticsContext, stmtCtx,
- *outerDoConstruct, eval, accClauseList);
+ *outerDoConstruct, eval, accClauseList,
+ mlir::acc::CombinedConstructsType::SerialLoop);
} else {
llvm::report_fatal_error("Unknown combined construct encountered");
}
diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90
index d2134e8d2337c..74ee4e4687c86 100644
--- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90
@@ -37,13 +37,27 @@ subroutine acc_kernels_loop
! CHECK: %[[IFCONDITION:.*]] = fir.address_of(@{{.*}}ifcondition) : !fir.ref<!fir.logical<4>>
! CHECK: %[[DECLIFCONDITION:.*]]:2 = hlfir.declare %[[IFCONDITION]]
- !$acc kernels loop
+ !$acc kernels
+ !$acc loop
DO i = 1, n
a(i) = b(i)
END DO
+ !$acc end kernels
! CHECK: acc.kernels {
-! CHECK: acc.loop {{.*}} {
+! CHECK: acc.loop private{{.*}} {
+! CHECK: acc.yield
+! CHECK-NEXT: }{{$}}
+! CHECK: acc.terminator
+! CHECK-NEXT: }{{$}}
+
+ !$acc kernels loop
+ DO i = 1, n
+ a(i) = b(i)
+ END DO
+
+! CHECK: acc.kernels combined(kernels loop) {
+! CHECK: acc.loop combined(kernels loop) private{{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
! CHECK: acc.terminator
@@ -55,7 +69,7 @@ subroutine acc_kernels_loop
END DO
!$acc end kernels loop
-! CHECK: acc.kernels {
+! CHECK: acc.kernels {{.*}} {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -68,7 +82,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[ASYNC1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.kernels async([[ASYNC1]] : i32) {
+! CHECK: acc.kernels {{.*}} async([[ASYNC1]] : i32) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -81,7 +95,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[ASYNC2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels async([[ASYNC2]] : i32) {
+! CHECK: acc.kernels {{.*}} async([[ASYNC2]] : i32) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -93,7 +107,7 @@ subroutine acc_kernels_loop
a(i) = b(i)
END DO
-! CHECK: acc.kernels wait {
+! CHECK: acc.kernels {{.*}} wait {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -106,7 +120,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.kernels wait({[[WAIT1]] : i32}) {
+! CHECK: acc.kernels {{.*}} wait({[[WAIT1]] : i32}) {
! CHECK: acc.loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -120,7 +134,7 @@ subroutine acc_kernels_loop
! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32
! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32
-! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
+! CHECK: acc.kernels {{.*}} wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) {
! CHECK: acc.loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -134,7 +148,7 @@ subroutine acc_kernels_loop
! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
+! CHECK: acc.kernels {{.*}} wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) {
! CHECK: acc.loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -147,7 +161,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32
-! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) {
+! CHECK: acc.kernels {{.*}} num_gangs({[[NUMGANGS1]] : i32}) {
! CHECK: acc.loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -160,7 +174,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) {
+! CHECK: acc.kernels {{.*}} num_gangs({[[NUMGANGS2]] : i32}) {
! CHECK: acc.loop
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -173,7 +187,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[NUMWORKERS1:%.*]] = arith.constant 10 : i32
-! CHECK: acc.kernels num_workers([[NUMWORKERS1]] : i32) {
+! CHECK: acc.kernels {{.*}} num_workers([[NUMWORKERS1]] : i32) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -186,7 +200,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[NUMWORKERS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels num_workers([[NUMWORKERS2]] : i32) {
+! CHECK: acc.kernels {{.*}} num_workers([[NUMWORKERS2]] : i32) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -199,7 +213,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[VECTORLENGTH1:%.*]] = arith.constant 128 : i32
-! CHECK: acc.kernels vector_length([[VECTORLENGTH1]] : i32) {
+! CHECK: acc.kernels {{.*}} vector_length([[VECTORLENGTH1]] : i32) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -212,7 +226,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[VECTORLENGTH2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: acc.kernels vector_length([[VECTORLENGTH2]] : i32) {
+! CHECK: acc.kernels {{.*}} vector_length([[VECTORLENGTH2]] : i32) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -225,7 +239,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[IF1:%.*]] = arith.constant true
-! CHECK: acc.kernels if([[IF1]]) {
+! CHECK: acc.kernels {{.*}} if([[IF1]]) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -239,7 +253,7 @@ subroutine acc_kernels_loop
! CHECK: [[IFCOND:%.*]] = fir.load %{{.*}} : !fir.ref<!fir.logical<4>>
! CHECK: [[IF2:%.*]] = fir.convert [[IFCOND]] : (!fir.logical<4>) -> i1
-! CHECK: acc.kernels if([[IF2]]) {
+! CHECK: acc.kernels {{.*}} if([[IF2]]) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -252,7 +266,7 @@ subroutine acc_kernels_loop
END DO
! CHECK: [[SELF1:%.*]] = arith.constant true
-! CHECK: acc.kernels self([[SELF1]]) {
+! CHECK: acc.kernels {{.*}} self([[SELF1]]) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -264,7 +278,7 @@ subroutine acc_kernels_loop
a(i) = b(i)
END DO
-! CHECK: acc.kernels {
+! CHECK: acc.kernels {{.*}}{
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -278,7 +292,7 @@ subroutine acc_kernels_loop
! CHECK: %[[SELF2:.*]] = fir.convert %[[DECLIFCONDITION]]#1 : (!fir.ref<!fir.logical<4>>) -> i1
-! CHECK: acc.kernels self(%[[SELF2]]) {
+! CHECK: acc.kernels {{.*}} self(%[[SELF2]]) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -293,7 +307,7 @@ subroutine acc_kernels_loop
! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "a"}
! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "b"}
-! CHECK: acc.kernels dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.kernels {{.*}} dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -309,7 +323,7 @@ subroutine acc_kernels_loop
! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "a"}
! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copy>, name = "b"}
-! CHECK: acc.kernels dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.kernels {{.*}} dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -325,7 +339,7 @@ subroutine acc_kernels_loop
! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyin_readonly>, name = "b"}
-! CHECK: acc.kernels dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.kernels {{.*}} dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -339,7 +353,7 @@ subroutine acc_kernels_loop
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "a"}
! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_copyout>, name = "b"}
-! CHECK: acc.kernels dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.kernels {{.*}} dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -355,7 +369,7 @@ subroutine acc_kernels_loop
! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {name = "b"}
! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {dataClause = #acc<data_clause acc_create_zero>, name = "a"}
-! CHECK: acc.kernels dataOperands(%[[CREATE_B]], %[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.kernels {{.*}} dataOperands(%[[CREATE_B]], %[[CREATE_A]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -371,7 +385,7 @@ subroutine acc_kernels_loop
! CHECK: %[[NOCREATE_A:.*]] = acc.nocreate varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
! CHECK: %[[NOCREATE_B:.*]] = acc.nocreate varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {name = "b"}
-! CHECK: acc.kernels dataOperands(%[[NOCREATE_A]], %[[NOCREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.kernels {{.*}} dataOperands(%[[NOCREATE_A]], %[[NOCREATE_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -385,7 +399,7 @@ subroutine acc_kernels_loop
! CHECK: %[[PRESENT_A:.*]] = acc.present varPtr(%[[DECLA]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {name = "a"}
! CHECK: %[[PRESENT_B:.*]] = acc.present varPtr(%[[DECLB]]#1 : !fir.ref<!fir.array<10xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<10xf32>> {name = "b"}
-! CHECK: acc.kernels dataOperands(%[[PRESENT_A]], %[[PRESENT_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
+! CHECK: acc.kernels {{.*}} dataOperands(%[[PRESENT_A]], %[[PRESENT_B]] : !fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) {
! CHECK: acc.loop {{.*}} {
! CHECK: acc.yield
! CHECK-NEXT: }{{$}}
@@ -399,7 +413,7 @@ subro...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff 16c4843d3263050b359733a05e86cc4a09361aed 0f92463207da97f74a4a303fc53c6c1a4b2eab3c -- flang/lib/Lower/OpenACC.cpp View the diff from clang-format here.diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index a337a0f601..ad00b3f6c1 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1654,16 +1654,16 @@ static void privatizeIv(Fortran::lower::AbstractConverter &converter,
converter.bindSymbol(sym, op.getAccPtr());
}
-static mlir::acc::LoopOp
-createLoopOp(Fortran::lower::AbstractConverter &converter,
- mlir::Location currentLocation,
- Fortran::semantics::SemanticsContext &semanticsContext,
- Fortran::lower::StatementContext &stmtCtx,
- const Fortran::parser::DoConstruct &outerDoConstruct,
- Fortran::lower::pft::Evaluation &eval,
- const Fortran::parser::AccClauseList &accClauseList,
- std::optional<mlir::acc::CombinedConstructsType> combinedConstructs,
- bool needEarlyReturnHandling = false) {
+static mlir::acc::LoopOp createLoopOp(
+ Fortran::lower::AbstractConverter &converter,
+ mlir::Location currentLocation,
+ Fortran::semantics::SemanticsContext &semanticsContext,
+ Fortran::lower::StatementContext &stmtCtx,
+ const Fortran::parser::DoConstruct &outerDoConstruct,
+ Fortran::lower::pft::Evaluation &eval,
+ const Fortran::parser::AccClauseList &accClauseList,
+ std::optional<mlir::acc::CombinedConstructsType> combinedConstructs,
+ bool needEarlyReturnHandling = false) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate,
reductionOperands, cacheOperands, vectorOperands, workerNumOperands,
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couple of nits.
If you want to do a stack PR you can open the first PR on a branch on the llvm-project repo and then make this PR against the other branch.
flang/lib/Lower/OpenACC.cpp
Outdated
@@ -2002,6 +2003,11 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, | |||
if (!collapseDeviceTypes.empty()) | |||
loopOp.setCollapseDeviceTypeAttr(builder.getArrayAttr(collapseDeviceTypes)); | |||
|
|||
if (combinedConstructs) { | |||
loopOp.setCombinedAttr(mlir::acc::CombinedConstructsTypeAttr::get( | |||
builder.getContext(), combinedConstructs.value())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
builder.getContext(), combinedConstructs.value())); | |
builder.getContext(), *combinedConstructs)); |
flang/lib/Lower/OpenACC.cpp
Outdated
@@ -2047,7 +2053,7 @@ genACC(Fortran::lower::AbstractConverter &converter, | |||
std::get<std::optional<Fortran::parser::DoConstruct>>(loopConstruct.t); | |||
auto loopOp = createLoopOp(converter, currentLocation, semanticsContext, | |||
stmtCtx, *outerDoConstruct, eval, accClauseList, | |||
needEarlyExitHandling); | |||
{}, needEarlyExitHandling); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
{}, needEarlyExitHandling); | |
/*combinedConstructs=*/{}, needEarlyExitHandling); |
Or you can put a default value with the argument
flang/lib/Lower/OpenACC.cpp
Outdated
Fortran::semantics::SemanticsContext &semanticsContext, | ||
Fortran::lower::StatementContext &stmtCtx, | ||
const Fortran::parser::AccClauseList &accClauseList, | ||
std::optional<mlir::acc::CombinedConstructsType> combinedConstructs) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
std::optional<mlir::acc::CombinedConstructsType> combinedConstructs) { | |
std::optional<mlir::acc::CombinedConstructsType> combinedConstructs = std::nullopt) { |
So we don't need to pass {}
when there is not combined construct.
flang/lib/Lower/OpenACC.cpp
Outdated
@@ -2681,18 +2691,18 @@ genACC(Fortran::lower::AbstractConverter &converter, | |||
if (blockDirective.v == llvm::acc::ACCD_parallel) { | |||
createComputeOp<mlir::acc::ParallelOp>(converter, currentLocation, eval, | |||
semanticsContext, stmtCtx, | |||
accClauseList); | |||
accClauseList, {}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
accClauseList, {}); | |
accClauseList, /*combinedConstructs=*/{}); |
IF you want to keep it this way
flang/lib/Lower/OpenACC.cpp
Outdated
} else if (blockDirective.v == llvm::acc::ACCD_data) { | ||
genACCDataOp(converter, currentLocation, eval, semanticsContext, stmtCtx, | ||
accClauseList); | ||
} else if (blockDirective.v == llvm::acc::ACCD_serial) { | ||
createComputeOp<mlir::acc::SerialOp>(converter, currentLocation, eval, | ||
semanticsContext, stmtCtx, | ||
accClauseList); | ||
accClauseList, {}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
accClauseList, {}); | |
accClauseList, /*combinedConstructs=*/{}); |
flang/lib/Lower/OpenACC.cpp
Outdated
} else if (blockDirective.v == llvm::acc::ACCD_kernels) { | ||
createComputeOp<mlir::acc::KernelsOp>(converter, currentLocation, eval, | ||
semanticsContext, stmtCtx, | ||
accClauseList); | ||
accClauseList, {}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
accClauseList, {}); | |
accClauseList, /*combinedConstructs=*/{}); |
This patch adjusts argument passing for `APInt` to improve the compile-time. Compile-time improvement: https://llvm-compile-time-tracker.com/compare.php?from=d1f182c895728d89c5c3d198b133e212a5d9d4a3&to=ba3e326def3a6e5cd6d72ff5a49c74fba18de1df&stat=instructions:u
Prepare for dag-isel, also migrate some test case
The zero-value DT_JMPREL is benign but not needed. This is also code simplification available after https://reviews.llvm.org/D65651
…)" This reverts commit 01b1b0c. Breaks following AArch64 SVE buildbots: https://lab.llvm.org/buildbot/#/builders/184/builds/11363 https://lab.llvm.org/buildbot/#/builders/176/builds/9331
Here we introduce three new GMIR instructions to cover a set of trap intrinsics. The idea behind it is that generic intrinsics shouldn't be used with G_INTRINSIC opcode. These new instructions can match perfectly with existing trap ISD nodes. It allows X86, AArch64, RISCV and Mips to reuse SelectionDAG patterns for selection and avoid manual selection. However AMDGPU is an exception. It selects traps during legalization regardless SelectionDAG or GlobalISel. Since there are not many places where traps are used, this change attempts to clean up all the usages of G_INTRINSIC with trap intrinsics. So, there is no stage when both G_TRAP and G_INTRINSIC_W_SIDE_EFFECTS(@llvm.trap) are allowed.
…odules (llvm#85917) Clang modules take a significant compile time hit when pushing and popping diagnostics. Since all the headers are marked as system headers in the modulemap, we can simply disable this pushing and popping when building with clang modules.
Remove getSizeOrUnknown call when MachineMemOperand is created. For Scalable TypeSize, the MemoryType created becomes a scalable_vector. 2 MMOs that have scalable memory access can then use the updated BasicAA that understands scalable LocationSize. Original Patch by Harvin Iriawan Co-authored-by: David Green <david.green@arm.com>
``` --------------------------------------------------- Benchmark old new --------------------------------------------------- bm_mismatch<char>/1 0.835 ns 2.37 ns bm_mismatch<char>/2 1.44 ns 2.60 ns bm_mismatch<char>/3 2.06 ns 2.83 ns bm_mismatch<char>/4 2.60 ns 3.29 ns bm_mismatch<char>/5 3.15 ns 3.77 ns bm_mismatch<char>/6 3.82 ns 4.17 ns bm_mismatch<char>/7 4.29 ns 4.52 ns bm_mismatch<char>/8 4.78 ns 4.86 ns bm_mismatch<char>/16 9.06 ns 7.54 ns bm_mismatch<char>/64 31.7 ns 19.1 ns bm_mismatch<char>/512 249 ns 8.16 ns bm_mismatch<char>/4096 1956 ns 44.2 ns bm_mismatch<char>/32768 15498 ns 501 ns bm_mismatch<char>/262144 123965 ns 4479 ns bm_mismatch<char>/1048576 495668 ns 21306 ns bm_mismatch<short>/1 0.710 ns 2.12 ns bm_mismatch<short>/2 1.03 ns 2.66 ns bm_mismatch<short>/3 1.29 ns 3.56 ns bm_mismatch<short>/4 1.68 ns 4.29 ns bm_mismatch<short>/5 1.96 ns 5.18 ns bm_mismatch<short>/6 2.59 ns 5.91 ns bm_mismatch<short>/7 2.86 ns 6.63 ns bm_mismatch<short>/8 3.19 ns 7.33 ns bm_mismatch<short>/16 5.48 ns 13.0 ns bm_mismatch<short>/64 16.6 ns 4.06 ns bm_mismatch<short>/512 130 ns 13.8 ns bm_mismatch<short>/4096 985 ns 93.8 ns bm_mismatch<short>/32768 7846 ns 1002 ns bm_mismatch<short>/262144 63217 ns 10637 ns bm_mismatch<short>/1048576 251782 ns 42471 ns bm_mismatch<int>/1 0.716 ns 1.91 ns bm_mismatch<int>/2 1.21 ns 2.49 ns bm_mismatch<int>/3 1.38 ns 3.46 ns bm_mismatch<int>/4 1.71 ns 4.04 ns bm_mismatch<int>/5 2.00 ns 4.98 ns bm_mismatch<int>/6 2.43 ns 5.67 ns bm_mismatch<int>/7 3.05 ns 6.38 ns bm_mismatch<int>/8 3.22 ns 7.09 ns bm_mismatch<int>/16 5.18 ns 12.8 ns bm_mismatch<int>/64 16.6 ns 5.28 ns bm_mismatch<int>/512 129 ns 25.2 ns bm_mismatch<int>/4096 1009 ns 201 ns bm_mismatch<int>/32768 7776 ns 2144 ns bm_mismatch<int>/262144 62371 ns 20551 ns bm_mismatch<int>/1048576 254750 ns 90097 ns ```
Use `LINK_COMPONENTS` parameter of `add_llvm_library` rather than passing LLVM components directly to `target_link_libraries`, in order to ensure that LLVM dylib is linked correctly when used. Otherwise, CMake insists on linking to static libraries that aren't present on distributions doing pure dylib installs, such as Gentoo. This fixes a regression introduced in dcbddc2.
If any return from overwriteChangedFiles is true some fixes were not applied.
Commit f44db24 (2015) enabled this simplication.
MIPS is different and should better off use separate code.
…lvm#84464) Instead of keeping a mapping of Inst->VPValues (of their corresponding recipes) in VPlan's Value2VPValue mapping, keep it in VPRecipeBuilder instead. After recently replacing the last user of this mapping after initial construction, this mapping is only needed for recipe construction (to map IR operands to VPValue operands). By moving the mapping, VPlan's VPValue tracking can be simplified and limited only to live-ins. It also allows removing disableValue2VPValue and associated machinery & asserts. PR: llvm#84464
…ion (llvm#86386) Move the code adding top-level cmake/Modules directory to CMAKE_MODULE_PATH prior to including `GetDarwinLinkerVersion`, in order to fix standalone builds. Fixes a regression introduced by 3bc71c2.
Hide the implementations of `FuncHashes` and `BBHashMap` classes, getting rid of `at` accessors that could throw an exception. Test Plan: NFC Reviewers: ayermolo, maksfb, dcci, rafaelauler Reviewed By: rafaelauler Pull Request: llvm#86353
Attach branch counters to YAML profile, covering intra-function control flow. Depends on: llvm#86353 Test Plan: Updated bolt/test/X86/bolt-address-translation-yaml.test Reviewers: rafaelauler, dcci, ayermolo, maksfb Reviewed By: rafaelauler Pull Request: llvm#76911
This patch removes APIs that creating NUW neg. It is a trivial case because `sub nuw 0, X` always gets simplified into zero. I believe there is no optimization opportunities in the real-world applications that we can take advantage of the nuw flag. Motivated by llvm#84792 (comment). Compile-time improvement: https://llvm-compile-time-tracker.com/compare.php?from=d1f182c895728d89c5c3d198b133e212a5d9d4a3&to=da7b7478b7cbb32c09d760f6b8d0e67901e0d533&stat=instructions:u
Fixed the printing of templated argument list and added test case.
…NFC. (llvm#86259) This patch passes APInt by const reference in m_SpecificInt instead of by value. Specifically, it refactors `m_SpecificInt(uint64_t V)` to avoid APInt construction and dangling reference. I believe it is safe to pass the APInt by const reference into `m_SpecificInt` even if it is a temporary. See also https://en.cppreference.com/w/cpp/language/lifetime > All temporary objects are destroyed as the last step in evaluating the [full-expression](https://en.cppreference.com/w/cpp/language/expressions#Full-expressions) that (lexically) contains the point where they were created Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=d1f182c895728d89c5c3d198b133e212a5d9d4a3&to=7edf459b95ab2be33b70ec67faf87b3b8cc84f09&stat=instructions:u
…5911) Currently patchpoints can only have two result types, `void` and `i64`. This limits the result to general purpose registers. This patch makes `patchpoint.i64` an overloadable intrinsic, allowing result values that can fit in a single register (e.g. integers, pointers, floats).
Need to include initial sext/zext/trunc nodes to the list of the demoted root values to correctly calculate the cost and handle the vectorization.
…tadata (llvm#86431) Similar to llvm#80829 for GlobalISel.
Added a new variant of the CHECK() function that takes a custom message as a parameter. This is useful for more meaninful error messages when the compiler is expected to crash. Fixes llvm#78931
…lvm#84864) This change: - Updates the existing Clang User's Manual section on SPGO so that it describes how to use llvm-profgen to perform SPGO on Windows. This is new functionality implemented in llvm#83972. - Fixes a minor typo in the existing llvm-profgen invocation example. - Adds an LLVM release note on this new functionality in llvm-profgen.
…#86655) This matches the CMake targets and reduces the number of headers that need to be included in multiple targets.
…5378) Using remove() on DeclContext::lookup_result list invalidates iterators. This assertion failure was one (fortunate) symptom: ``` clang/include/clang/AST/DeclBase.h:1337: reference clang::DeclListNode::iterator::operator*() const: Assertion `Ptr && "dereferencing end() iterator"' failed. ```
…ich are needed to authenticate signed pointers (llvm#67454)" (llvm#86674) This reverts commit 8bd1f91. It appears that the commit broke msan bots.
These tests show invalid tbaa.struct metadata that is currently accepted in preparation for a change to the IR Verifier that will then reject it. PR: llvm#86167
- Revamped lowering conversion pattern for `tosa.reshape` to handle previously unsupported combinations of dynamic dimensions in input and output tensors. The lowering strategy continues to rely on pairs `tensor.collapse_shape` + `tensor.expand_shape`, which allow for downstream fusion with surrounding `linalg.generic` ops. - Fixed bug in canonicalization pattern `ReshapeOp::fold()` in `TosaCanonicalizations.cpp`. The input and result types being equal is not a sufficient condition for folding. If there is more than 1 dynamic dimension in the input and result types, a productive reshape could still occur. - This work exposed the fact that bufferization does not properly handle a `tensor.collapse_shape` op producing a 0D tensor from a dynamically shaped one due to a limitation in `memref.collapse_shape`. While the proper way to address this would involve releasing the `memref.collapse_shape` restriction and verifying correct bufferization, this is left as possible future work. For now, this scenario is avoided by casting the `tosa.reshape` input tensor to a static shape if necessary (see `inferReshapeInputType()`. - An extended set of tests are intended to cover relevant conversion paths. Tests are named using pattern `test_reshape_<rank>_{up|down|same}_{s2s|s2d|d2s|d2d}_{explicit|auto}[_empty][_identity]`, where: - `<rank>` is the input rank (e.g., 3d, 6d) - `{up|down|same}` indicates whether the reshape increases, decreases, or retains the input rank. - `{s2s|s2d|d2s|d2d}` indicates whether reshape converts a statically shaped input to a statically shaped result (`s2s`), a statically shaped input to a dynamically shaped result (`s2d`), etc. - `{explicit|auto}` is used to indicate that all values in the `new_shape` attribute are >=0 (`explicit`) or that a -1 placeholder value is used (`auto`). - `empty` is used to indicate that `new_shape` includes a component set to 0. - `identity` is used when the input and result shapes are the same.
Manually merged tests
Woah - I don't know how I caused this mess on rebase - please ignore this. Sorry to all of the implicit reviewers. |
PR#80319 added support to record combined construct semantics via an attribute. Add lowering support for this.