Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP][flang][OpenMP] Experimental pass to map do concurrent to OMP #77285

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion flang/include/flang/Optimizer/HLFIR/HLFIROps.td
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
func.func @foo(%arg0: !fir.ref<!fir.array<?x?x!fir.char<1,?>>>, %arg1: !fir.ref<i64>) {
%c10 = arith.constant 10 : index
%c20 = arith.constant 20 : index
%1 = fir.load %ag1 : fir.ref<i64>
%1 = fir.load %arg1 : fir.ref<i64>
%2 = fir.shape_shift %c10, %1, %c20, %1 : (index, index, index, index) -> !fir.shapeshift<2>
%3 = hlfir.declare %arg0(%2) typeparams %1 {uniq_name = "c"} (fir.ref<!fir.array<?x?x!fir.char<1,?>>>, fir.shapeshift<2>, index) -> (fir.box<!fir.array<?x?x!fir.char<1,?>>>, fir.ref<!fir.array<?x?x!fir.char<1,?>>>)
// ... uses %3#0 as "c"
Expand Down
2 changes: 2 additions & 0 deletions flang/include/flang/Optimizer/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath,
bool noNaNsFPMath, bool approxFuncFPMath,
bool noSignedZerosFPMath, bool unsafeFPMath);

std::unique_ptr<mlir::Pass> createDoConcurrentConversionPass();

// declarative passes
#define GEN_PASS_REGISTRATION
#include "flang/Optimizer/Transforms/Passes.h.inc"
Expand Down
20 changes: 20 additions & 0 deletions flang/include/flang/Optimizer/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -397,4 +397,24 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> {
let constructor = "::fir::createFunctionAttrPass()";
}

def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> {
let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops.";

let description = [{ This is an experimental pass to map `DO CONCURRENT` loops
to their corresponding equivalent OpenMP worksharing constructs.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
to their correspnding equivalent OpenMP worksharing constructs.
to their corresponding equivalent OpenMP worksharing constructs.


For now the following is supported:
- Mapping simple loops to `parallel do`.

Still TODO:
- More extensive testing.
- Mapping to `target teams distribute parallel do`.
- Allowing the user to control mapping behavior: either to the host or
target.
}];

let constructor = "::fir::createDoConcurrentConversionPass()";
let dependentDialects = ["mlir::omp::OpenMPDialect"];
}

#endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
1 change: 1 addition & 0 deletions flang/lib/Optimizer/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ add_flang_library(FIRTransforms
OMPMarkDeclareTarget.cpp
VScaleAttr.cpp
FunctionAttr.cpp
DoConcurrentConversion.cpp

DEPENDS
FIRDialect
Expand Down
205 changes: 205 additions & 0 deletions flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"

#include <memory>

namespace fir {
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"

namespace {
/// Conversion pattern that rewrites a `fir.do_loop` into an `omp.parallel`
/// region containing an `omp.wsloop` with the same (inclusive) bounds, step
/// and body.
///
/// The pattern itself matches every `fir.do_loop`; which loops are actually
/// rewritten is decided by the legality rules of the conversion target that
/// drives this pattern.
///
/// The rewrite proceeds in three steps:
///   1. Analyse the loop (no IR is modified): find the `fir.store`s that the
///      induction variable flows into and collect the ops that define the
///      stored-to memory so they can be re-created privately.
///   2. Build `omp.parallel`, clone the ops defining the loop bounds and step
///      into it, and create the `omp.wsloop`.
///   3. Clone the collected ops into the parallel region's alloca block,
///      clone the loop body into the `omp.wsloop` using the resulting value
///      mapping, replace the body terminator with `omp.yield`, and erase the
///      original loop.
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
public:
  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;

  /// Rewrites `doLoop` as described on the class.
  ///
  /// Returns failure (before touching the IR) when the loop's results are
  /// still in use, or when the induction variable is stored to memory that is
  /// not produced by an operation; returns success once the loop has been
  /// replaced.
  mlir::LogicalResult
  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    // The loop is erased at the end and nothing replaces its results, so any
    // remaining use of those results would be left dangling.
    if (!doLoop->use_empty())
      return rewriter.notifyMatchFailure(
          doLoop, "results of the loop are used; cannot erase it");

    // Run the analysis first so that failures are reported before any new
    // operation has been created.
    llvm::SmallSetVector<mlir::Operation *, 4> declareAndAllocasToClone;
    if (mlir::failed(
            collectInductionVarDefOps(doLoop, declareAndAllocasToClone)))
      return rewriter.notifyMatchFailure(
          doLoop, "induction variable is stored to a value that is not "
                  "defined by an operation");

    mlir::omp::ParallelOp parallelOp =
        rewriter.create<mlir::omp::ParallelOp>(doLoop.getLoc());

    mlir::Block *block = rewriter.createBlock(&parallelOp.getRegion());

    rewriter.setInsertionPointToEnd(block);
    rewriter.create<mlir::omp::TerminatorOp>(doLoop.getLoc());

    rewriter.setInsertionPointToStart(block);

    // ==== TODO (1) Start ====
    //
    // The goal of the few lines below is to collect and clone
    // the list of operations that define the loop's lower and upper bounds as
    // well as the step. Should we, instead of doing this here, split it into 2
    // stages?
    //
    //   1. **Stage 1**: add an analysis that extracts all the relevant
    //                   operations defining the lower-bound, upper-bound, and
    //                   step.
    //   2. **Stage 2**: clone the collected operations in the parallel region.
    //
    // So far, the pass has been tested with very simple loops (where the bounds
    // and step are constants) so the goal of **Stage 1** is to have a
    // well-defined component that has the sole responsibility of collecting all
    // the ops relevant to the loop header. This way we can test this in
    // isolation for more complex loops and better organize the code. **Stage
    // 2** would then be responsible for the actual cloning of the collected
    // loop header preparation/allocation operations.

    // Clone the LB, UB, step defining ops inside the parallel region. The
    // three clones are created in this order on purpose: it fixes the order of
    // the resulting ops in the region.
    llvm::SmallVector<mlir::Value> lowerBound, upperBound, step;
    lowerBound.push_back(cloneLoopBound(rewriter, doLoop.getLowerBound()));
    upperBound.push_back(cloneLoopBound(rewriter, doLoop.getUpperBound()));
    step.push_back(cloneLoopBound(rewriter, doLoop.getStep()));
    // ==== TODO (1) End ====

    auto wsLoopOp = rewriter.create<mlir::omp::WsLoopOp>(
        doLoop.getLoc(), lowerBound, upperBound, step);
    wsLoopOp.setInclusive(true);

    // `omp.parallel` is created right above, so the interface cast cannot
    // fail; `cast` (rather than an unchecked `dyn_cast`) states that.
    auto outlineableOp =
        mlir::cast<mlir::omp::OutlineableOpenMPOpInterface>(*parallelOp);
    rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock());

    // TODO (1 & 2): Isolating the analyses proposed in both TODOs, I think we
    // can more easily generalize the pass to work for targets other than
    // OpenMP, e.g. OpenACC: we can reuse the results of the analyses and only
    // change the code-gen/rewriting.

    mlir::IRMapping mapper;

    // Re-create the memref defining ops in the parallel region; `mapper`
    // records original-value -> clone so the body clone below picks them up.
    for (mlir::Operation *opToClone : declareAndAllocasToClone)
      rewriter.clone(*opToClone, mapper);

    // Clone the loop's body inside the worksharing construct using the mapped
    // memref values.
    rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(),
                               wsLoopOp.getRegion().begin(), mapper);

    // The cloned body still ends with the original loop's terminator; swap it
    // for the terminator `omp.wsloop` expects.
    mlir::Operation *terminator = wsLoopOp.getRegion().back().getTerminator();
    rewriter.setInsertionPointToEnd(&wsLoopOp.getRegion().back());
    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
    rewriter.eraseOp(terminator);

    rewriter.eraseOp(doLoop);

    return mlir::success();
  }

private:
  /// Clones the operation defining `bound` at the rewriter's current insertion
  /// point and returns the clone's result that corresponds to `bound`.
  ///
  /// If `bound` is not defined by an operation (i.e. it is a block argument)
  /// there is nothing to clone and `bound` itself is returned.
  static mlir::Value cloneLoopBound(mlir::ConversionPatternRewriter &rewriter,
                                    mlir::Value bound) {
    mlir::Operation *definingOp = bound.getDefiningOp();
    if (!definingOp)
      return bound;

    // Go through a mapping instead of `getResult(0)` so that the right result
    // is returned even when `bound` is not the first result of its op.
    mlir::IRMapping boundMapper;
    rewriter.clone(*definingOp, boundMapper);
    return boundMapper.lookup(bound);
  }

  /// Collects into `opsToClone` the operations that define the memory the
  /// loop's induction variable is stored to.
  ///
  /// For every `fir.store` reachable from the induction variable through
  /// use-chains, the op defining the store's memref is collected, preceded by
  /// the ops defining that op's operands. Operands that are block arguments
  /// have no defining op and are skipped.
  ///
  /// Returns failure if a store's memref is itself not defined by an
  /// operation. This function does not modify the IR.
  static mlir::LogicalResult collectInductionVarDefOps(
      fir::DoLoopOp doLoop,
      llvm::SmallSetVector<mlir::Operation *, 4> &opsToClone) {
    // ==== TODO (2) Start ====
    //
    // The goal of the following simple work-list algorithm and
    // the following `for` loop is to collect all the operations related to the
    // allocation of the induction variable for the `do concurrent` loop. The
    // operations collected by this algorithm are very similar to what is
    // usually emitted for privatized variables, e.g. for omp.parallel loops.
    // Therefore, I think we can:
    //
    //   1. **Stage 1**: Add an analysis that collects all these operations.
    //                   The goal is similar to **Stage 1** of TODO (1):
    //                   isolate the algorithm in an individually-testable
    //                   component so that we properly implement and test it
    //                   for more complicated `do concurrent` loops.
    //   2. **Stage 2**: Using the collected operations, create and populate an
    //                   `omp.private {type=private}` op to serve as the
    //                   delayed privatizer for the new work-sharing loop.

    // For the induction variable, we need to privatize its allocation and
    // binding inside the parallel region.
    //
    // Therefore, we first discover the induction variable by discovering
    // `fir.store`s where the source is the loop's block argument.
    llvm::SmallSetVector<mlir::Operation *, 2> workList;
    auto inductionVarUsers = doLoop.getInductionVar().getUsers();
    workList.insert(inductionVarUsers.begin(), inductionVarUsers.end());
    llvm::SmallSetVector<fir::StoreOp, 2> inductionVarTargetStores;

    // Walk the use-chain of the loop's block argument until we hit
    // `fir.store`. The set only grows while it is being indexed, so every op
    // is visited exactly once, in discovery order.
    for (unsigned idx = 0; idx < workList.size(); ++idx) {
      mlir::Operation *item = workList[idx];

      if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(item)) {
        inductionVarTargetStores.insert(storeOp);
        continue;
      }

      auto itemUsers = item->getUsers();
      workList.insert(itemUsers.begin(), itemUsers.end());
    }

    // For each collected `fir.store`, find the target memref's allocas and
    // declare ops.
    for (fir::StoreOp storeOp : inductionVarTargetStores) {
      mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp();
      if (!storeTarget)
        return mlir::failure();

      for (mlir::Value operand : storeTarget->getOperands())
        if (mlir::Operation *operandDef = operand.getDefiningOp())
          opsToClone.insert(operandDef);

      opsToClone.insert(storeTarget);
    }
    // ==== TODO (2) End ====

    return mlir::success();
  }
};

/// Function pass that applies `DoConcurrentConversion` to every
/// `fir.do_loop` carrying the `unordered` attribute.
///
/// FIR, HLFIR, arith, func and OpenMP operations are declared legal; a
/// `fir.do_loop` is legal only when it is not `unordered`, so exactly the
/// unordered loops must be rewritten for the full conversion to succeed.
class DoConcurrentConversionPass
    : public fir::impl::DoConcurrentConversionPassBase<
          DoConcurrentConversionPass> {
public:
  /// Runs the conversion on the current function. Function declarations are
  /// skipped. On conversion failure an error is emitted at the function's
  /// location and the pass is marked as failed.
  void runOnOperation() override {
    mlir::func::FuncOp func = getOperation();

    if (func.isDeclaration())
      return;

    auto *context = &getContext();
    mlir::RewritePatternSet patterns(context);
    patterns.insert<DoConcurrentConversion>(context);

    mlir::ConversionTarget target(*context);
    target.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect,
                           mlir::arith::ArithDialect, mlir::func::FuncDialect,
                           mlir::omp::OpenMPDialect>();

    // Only `unordered` loops are illegal, i.e. need to be converted.
    target.addDynamicallyLegalOp<fir::DoLoopOp>(
        [](fir::DoLoopOp op) { return !op.getUnordered(); });

    if (mlir::failed(
            mlir::applyFullConversion(func, target, std::move(patterns)))) {
      // Report at the function being processed so the diagnostic carries a
      // usable source location instead of an unknown one.
      mlir::emitError(func.getLoc(), "error in converting do-concurrent op");
      signalPassFailure();
    }
  }
};
} // namespace

/// Creates a new instance of the pass that maps `do concurrent` loops to
/// OpenMP; ownership of the instance is transferred to the caller.
std::unique_ptr<mlir::Pass> fir::createDoConcurrentConversionPass() {
  auto conversionPass = std::make_unique<DoConcurrentConversionPass>();
  return conversionPass;
}
60 changes: 60 additions & 0 deletions flang/test/Transforms/DoConcurrent/basic.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.

// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s

// CHECK-LABEL: func.func @do_concurrent_basic
// Input: a function with a single `unordered` fir.do_loop over [1, 10] with
// step 1 that stores the iteration value into `i` and assigns it to a(i).
func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
// Ops that are expected to stay outside of the generated parallel region.
// CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
// CHECK: %[[C1:.*]] = arith.constant 1 : i32
// CHECK: %[[C10:.*]] = arith.constant 10 : i32

// Storage and declaration of the iteration variable `i`.
%0 = fir.alloca i32 {bindc_name = "i"}
%1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
// The 10-element i32 array `a` that the loop body writes to.
%2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
%c10 = arith.constant 10 : index
%3 = fir.shape %c10 : (index) -> !fir.shape<1>
%4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
// Loop header values: lower bound 1 and upper bound 10 (converted from i32
// to index) and the step constant 1.
%c1_i32 = arith.constant 1 : i32
%7 = fir.convert %c1_i32 : (i32) -> index
%c10_i32 = arith.constant 10 : i32
%8 = fir.convert %c10_i32 : (i32) -> index
%c1 = arith.constant 1 : index

// CHECK-NOT: fir.do_loop

// CHECK: omp.parallel {

// The alloca/declare pair of `i` is expected to be re-created first thing
// inside the parallel region.
// CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
// CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)

// The ops defining the bounds and the step are expected inside the region as
// well; their operands (%[[C1]], %[[C10]]) remain outside.
// CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
// CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
// CHECK: %[[STEP:.*]] = arith.constant 1 : index

// The loop body is expected unchanged, except that it uses the re-created
// binding of `i` and ends with omp.yield.
// CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
// CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
// CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
// CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
// CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
// CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
// CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
// CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
// CHECK-NEXT: omp.yield
// CHECK-NEXT: }

// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
// The loop under test: stores the induction value into `i`, then assigns the
// loaded value of `i` to the element of `a` designated by `i`.
fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
%13 = fir.convert %arg0 : (index) -> i32
fir.store %13 to %1#1 : !fir.ref<i32>
%14 = fir.load %1#0 : !fir.ref<i32>
%15 = fir.load %1#0 : !fir.ref<i32>
%16 = fir.convert %15 : (i32) -> i64
%17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
hlfir.assign %14 to %17 : i32, !fir.ref<i32>
}

// CHECK-NOT: fir.do_loop

return
}
Loading