From aeed526e877c13566a2e68d9f4c670322e97c8d5 Mon Sep 17 00:00:00 2001 From: ergawy Date: Thu, 22 Feb 2024 06:21:51 -0600 Subject: [PATCH] [flang][OpenMP] Add support for `target ... private` Adds support for the `private` clause in the `target` directive. In order to support that, the `DataSharingProcessor` was also restructured in order to separate the collection of prviate symbols from their actual privatization code-gen. The commit adds both a code-gen and an offloading tests. --- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 10 ++- flang/lib/Lower/OpenMP/DataSharingProcessor.h | 41 ++++++++++--- flang/lib/Lower/OpenMP/OpenMP.cpp | 61 +++++++++++++------ flang/test/Lower/OpenMP/target_private.f90 | 30 +++++++++ .../offloading/fortran/target_private.f90 | 29 +++++++++ 5 files changed, 141 insertions(+), 30 deletions(-) create mode 100644 flang/test/Lower/OpenMP/target_private.f90 create mode 100644 openmp/libomptarget/test/offloading/fortran/target_private.f90 diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 717b8cc0276a30..b2c1c5ac0aa304 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -26,12 +26,20 @@ namespace omp { void DataSharingProcessor::processStep1() { collectSymbolsForPrivatization(); collectDefaultSymbols(); +} + +void DataSharingProcessor::processStep2() { + if (privatizationDone) + return; + privatize(); defaultPrivatize(); insertBarrier(); + + privatizationDone = true; } -void DataSharingProcessor::processStep2(mlir::Operation *op, bool isLoop) { +void DataSharingProcessor::processStep3(mlir::Operation *op, bool isLoop) { insPt = firOpBuilder.saveInsertionPoint(); copyLastPrivatize(op); firOpBuilder.restoreInsertionPoint(insPt); diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index 9f7301df07598f..fc7e3570ad7f60 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -54,6 +54,8 @@ class DataSharingProcessor { fir::FirOpBuilder &firOpBuilder; const Fortran::parser::OmpClauseList &opClauseList; Fortran::lower::pft::Evaluation &eval; + bool privatizationDone = false; + bool useDelayedPrivatization; Fortran::lower::SymMap *symTable; DelayedPrivatizationInfo delayedPrivatizationInfo; @@ -90,25 +92,44 @@ class DataSharingProcessor { eval(eval), useDelayedPrivatization(useDelayedPrivatization), symTable(symTable) {} - // Privatisation is split into two steps. - // Step1 performs cloning of all privatisation clauses and copying for - // firstprivates. Step1 is performed at the place where process/processStep1 + // Privatisation is split into 3 steps: + // + // * Step1: collects all symbols that should be privatized. + // + // * Step2: performs cloning of all privatisation clauses and copying for + // firstprivates. Step2 is performed at the place where process/processStep2 // is called. This is usually inside the Operation corresponding to the OpenMP - // construct, for looping constructs this is just before the Operation. The - // split into two steps was performed basically to be able to call - // privatisation for looping constructs before the operation is created since - // the bounds of the MLIR OpenMP operation can be privatised. - // Step2 performs the copying for lastprivates and requires knowledge of the - // MLIR operation to insert the last private update. Step2 adds + // construct, for looping constructs this is just before the Operation. + // + // * Step3: performs the copying for lastprivates and requires knowledge of + // the MLIR operation to insert the last private update. Step3 adds // dealocation code as well. + // + // The split was performed for the following reasons: + // + // 1. Step1 was split so that the `target` op knows which symbols should not + // be mapped into the target region due to being `private`. The implicit + // mapping happens before the op body is generated so we need to to collect + // the private symbols first and then later in the body actually privatize + // them. + // + // 2. Step2 was split in order to call privatisation for looping constructs + // before the operation is created since the bounds of the MLIR OpenMP + // operation can be privatised. void processStep1(); - void processStep2(mlir::Operation *op, bool isLoop); + void processStep2(); + void processStep3(mlir::Operation *op, bool isLoop); void setLoopIV(mlir::Value iv) { assert(!loopIV && "Loop iteration variable already set"); loopIV = iv; } + const llvm::SetVector & + getPrivatizedSymbols() const { + return privatizedSymbols; + } + const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const { return delayedPrivatizationInfo; } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index e499e16c19e046..573d962aebe10e 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -520,6 +520,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { if (!info.dsp) { tempDsp.emplace(info.converter, *info.clauses, info.eval); tempDsp->processStep1(); + tempDsp->processStep2(); } } @@ -593,11 +594,11 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { if (privatize) { if (!info.dsp) { assert(tempDsp.has_value()); - tempDsp->processStep2(op, isLoop); + tempDsp->processStep3(op, isLoop); } else { if (isLoop && regionArgs.size() > 0) info.dsp->setLoopIV(info.converter.getSymbolAddress(*regionArgs[0])); - info.dsp->processStep2(op, isLoop); + info.dsp->processStep3(op, isLoop); } } } @@ -816,8 +817,11 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, DataSharingProcessor dsp(converter, clauseList, eval, /*useDelayedPrivatization=*/true, &symTable); - if (privatize) + if (privatize) { dsp.processStep1(); + dsp.processStep2(); + } + const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo(); @@ -1093,7 +1097,9 @@ static void genBodyOfTargetOp( const llvm::SmallVector &mapSymTypes, const llvm::SmallVector &mapSymLocs, const llvm::SmallVector &mapSymbols, - const mlir::Location ¤tLocation) { + const mlir::Location ¤tLocation, + const Fortran::parser::OmpClauseList &clauseList, + DataSharingProcessor &dsp) { assert(mapSymTypes.size() == mapSymLocs.size()); fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); @@ -1102,6 +1108,8 @@ static void genBodyOfTargetOp( auto *regionBlock = firOpBuilder.createBlock(®ion, {}, mapSymTypes, mapSymLocs); + dsp.processStep2(); + // Clones the `bounds` placing them inside the target region and returns them. auto cloneBound = [&](mlir::Value bound) { if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { @@ -1243,7 +1251,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, bool genNested, mlir::Location currentLocation, const Fortran::parser::OmpClauseList &clauseList, - llvm::omp::Directive directive, bool outerCombined = false) { + llvm::omp::Directive directive, bool outerCombined = false, + DataSharingProcessor *dsp = nullptr) { Fortran::lower::StatementContext stmtCtx; mlir::Value ifClauseOperand, deviceOperand, threadLimitOperand; mlir::UnitAttr nowaitAttr; @@ -1262,9 +1271,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, cp.processDepend(dependTypeOperands, dependOperands); cp.processMap(currentLocation, directive, stmtCtx, mapOperands, &mapSymTypes, &mapSymLocs, &mapSymbols); - - cp.processTODO( currentLocation, llvm::omp::Directive::OMPD_target); + DataSharingProcessor localDSP(converter, clauseList, eval); + DataSharingProcessor &actualDSP = dsp ? *dsp : localDSP; + actualDSP.processStep1(); + // Process host-only clauses. if (!llvm::cast(*converter.getModuleOp()) .getIsTargetDevice()) @@ -1283,9 +1294,14 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, // 5.8.1 Implicit Data-Mapping Attribute Rules // The following code follows the implicit data-mapping rules to map all the - // symbols used inside the region that have not been explicitly mapped using - // the map clause. + // symbols used inside the region that do not have explicit data-environment + // attribute clauses (neither data-sharing; e.g. `private`, nor `map` + // clauses). auto captureImplicitMap = [&](const Fortran::semantics::Symbol &sym) { + if (actualDSP.getPrivatizedSymbols().contains(&sym)) { + return; + } + if (llvm::find(mapSymbols, &sym) == mapSymbols.end()) { mlir::Value baseOp = converter.getSymbolAddress(sym); if (!baseOp) @@ -1383,7 +1399,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, /*teams_thread_limit=*/nullptr, /*num_threads=*/nullptr); genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSymTypes, - mapSymLocs, mapSymbols, currentLocation); + mapSymLocs, mapSymbols, currentLocation, clauseList, + actualDSP); return targetOp; } @@ -1459,7 +1476,8 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, bool genNested, mlir::Location currentLocation, const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { + bool outerCombined = false, + DataSharingProcessor *dsp = nullptr) { // TODO Process clauses // ClauseProcessor cp(converter, clauseList); // cp.processAllocate(allocatorOperands, allocateOperands); @@ -1469,7 +1487,8 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter, OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) .setGenNested(genNested) .setOuterCombined(outerCombined) - .setClauses(&clauseList), + .setClauses(&clauseList) + .setDataSharingProcessor(dsp), /*dist_schedule_static=*/nullptr, /*chunk_size=*/nullptr, /*allocate_vars=*/mlir::ValueRange(), @@ -1780,6 +1799,7 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); DataSharingProcessor dsp(converter, loopOpClauseList, eval); dsp.processStep1(); + dsp.processStep2(); Fortran::lower::StatementContext stmtCtx; mlir::Value scheduleChunkClauseOperand, ifClauseOperand; @@ -1836,10 +1856,10 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter, llvm::omp::Directive ompDirective, const Fortran::parser::OmpClauseList &beginClauseList, const Fortran::parser::OmpClauseList *endClauseList, - mlir::Location loc) { + mlir::Location loc, DataSharingProcessor &dsp) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, beginClauseList, eval); dsp.processStep1(); + dsp.processStep2(); Fortran::lower::StatementContext stmtCtx; mlir::Value scheduleChunkClauseOperand; @@ -1962,8 +1982,9 @@ static void createSimdWsLoop( // When support for vectorization is enabled, then we need to add handling of // if clause. Currently if clause can be skipped because we always assume // SIMD length = 1. + DataSharingProcessor dsp(converter, beginClauseList, eval); createWsLoop(converter, semaCtx, eval, ompDirective, beginClauseList, - endClauseList, loc); + endClauseList, loc, dsp); } static void genOMP(Fortran::lower::AbstractConverter &converter, @@ -1992,6 +2013,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, }(); bool validDirective = false; + DataSharingProcessor dsp(converter, loopOpClauseList, eval); + if (llvm::omp::topTaskloopSet.test(ompDirective)) { validDirective = true; TODO(currentLocation, "Taskloop construct"); @@ -2002,7 +2025,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, validDirective = true; genTargetOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, loopOpClauseList, ompDirective, - /*outerCombined=*/true); + /*outerCombined=*/true, &dsp); } if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet) .test(ompDirective)) { @@ -2014,7 +2037,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, validDirective = true; bool outerCombined = llvm::omp::topDistributeSet.test(ompDirective); genDistributeOp(converter, semaCtx, eval, /*genNested=*/false, - currentLocation, loopOpClauseList, outerCombined); + currentLocation, loopOpClauseList, outerCombined, &dsp); } if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet) .test(ompDirective)) { @@ -2045,7 +2068,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, genOpenMPReduction(converter, semaCtx, loopOpClauseList); } else { createWsLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList, - endClauseList, currentLocation); + endClauseList, currentLocation, dsp); } } diff --git a/flang/test/Lower/OpenMP/target_private.f90 b/flang/test/Lower/OpenMP/target_private.f90 new file mode 100644 index 00000000000000..98e3b79d035dfd --- /dev/null +++ b/flang/test/Lower/OpenMP/target_private.f90 @@ -0,0 +1,30 @@ +!Test data-sharing attribute clauses for the `target` directive. + +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPomp_target_private() +subroutine omp_target_private + implicit none + integer :: x(1) + +!$omp target private(x) + x(1) = 42 +!$omp end target +!CHECK: omp.target { +!CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +!CHECK-DAG: %[[PRIV_ALLOC:.*]] = fir.alloca !fir.array<1xi32> {bindc_name = "x", +!CHECK-SAME: pinned, uniq_name = "_QFomp_target_privateEx"} +!CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C1]] : (index) -> !fir.shape<1> +!CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]](%[[SHAPE]]) +!CHECK-SAME: {uniq_name = "_QFomp_target_privateEx"} : +!CHECK-SAME: (!fir.ref>, !fir.shape<1>) -> +!CHECK-SAME: (!fir.ref>, !fir.ref>) +!CHECK-DAG: %[[C42:.*]] = arith.constant 42 : i32 +!CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +!CHECK-NEXT: %[[PRIV_BINDING:.*]] = hlfir.designate %[[PRIV_DECL]]#0 (%[[C1_2]]) +!CHECK-SAME: : (!fir.ref>, index) -> !fir.ref +!CHECK-NEXT: hlfir.assign %[[C42]] to %[[PRIV_BINDING]] : i32, !fir.ref +!CHECK-NEXT: omp.terminator +!CHECK-NEXT: } + +end subroutine omp_target_private diff --git a/openmp/libomptarget/test/offloading/fortran/target_private.f90 b/openmp/libomptarget/test/offloading/fortran/target_private.f90 new file mode 100644 index 00000000000000..486c23ec2ec8d2 --- /dev/null +++ b/openmp/libomptarget/test/offloading/fortran/target_private.f90 @@ -0,0 +1,29 @@ +! Basic offloading test with a target region +! REQUIRES: flang +! UNSUPPORTED: nvptx64-nvidia-cuda-LTO +! UNSUPPORTED: aarch64-unknown-linux-gnu +! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +! UNSUPPORTED: x86_64-pc-linux-gnu +! UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +! RUN: %libomptarget-compile-fortran-generic +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +program target_update + implicit none + integer :: x(1) + integer :: y(1) + + x(1) = 42 + +!$omp target private(x) map(tofrom: y) + x(1) = 84 + y(1) = x(1) +!$omp end target + + print *, "x =", x(1) + print *, "y =", y(1) + +end program target_update +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: x = 42 +! CHECK: y = 84