From 3be535d7404b4659d2d0015da589a6ddc6cb5896 Mon Sep 17 00:00:00 2001 From: ergawy Date: Thu, 12 Jun 2025 06:14:21 -0500 Subject: [PATCH 1/3] [flang][do concurrent] Re-model `reduce` to match reductions are modelled in OpenMP and OpenACC This PR proposes re-modelling `reduce` specifiers to match OpenMP and OpenACC. In particular, this PR includes the following: * A new `fir` op: `fir.delcare_reduction` which is identical to OpenMP's `omp.declare_reduction` op. * Updating the `reduce` clause on `fir.do_concurrent.loop` to use the new op. * Re-uses the `ReductionProcessor` component to emit reductions for `do concurrent` just like we do for OpenMP. To do this, the `ReductionProcessor` had to be refactored to be more generalized. * Upates mapping `do concurrent` to `fir.loop ... unordered` nests using the new reduction model. Unfortunately, this is a big PR that would be difficult to divide up in smaller parts because the bottom of the changes are the `fir` table-gen changes to `do concurrent`. However, doing these MLIR changes cascades to the other parts that have to be modified to not break things. --- .../flang/Optimizer/Dialect/FIRAttr.td | 2 +- .../include/flang/Optimizer/Dialect/FIROps.td | 136 +++++++- flang/lib/Lower/Bridge.cpp | 85 +++-- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 79 ++++- flang/lib/Lower/OpenMP/ClauseProcessor.h | 1 - flang/lib/Lower/OpenMP/Clauses.h | 1 + flang/lib/Lower/OpenMP/OpenMP.cpp | 1 - flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 282 +++++++++-------- flang/lib/Lower/OpenMP/ReductionProcessor.h | 30 +- flang/lib/Lower/Support/Utils.cpp | 4 +- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 48 +-- flang/lib/Optimizer/Dialect/FIROps.cpp | 293 ++++++++++++------ .../Transforms/SimplifyFIROperations.cpp | 10 +- flang/test/Fir/do_concurrent.fir | 63 +++- flang/test/Fir/invalid.fir | 18 +- flang/test/Lower/do_concurrent_reduce.f90 | 41 +++ flang/test/Lower/loops.f90 | 2 +- flang/test/Lower/loops3.f90 | 2 +- .../do_concurrent-to-do_loop-unodered.fir | 2 +- 19 files changed, 780 insertions(+), 320 deletions(-) create mode 100644 flang/test/Lower/do_concurrent_reduce.f90 diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 2845080030b92..7bd96ac3ea631 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -112,7 +112,7 @@ def fir_ReduceOperationEnum : I32BitEnumAttr<"ReduceOperationEnum", I32BitEnumAttrCaseBit<"MIN", 7, "min">, I32BitEnumAttrCaseBit<"IAND", 8, "iand">, I32BitEnumAttrCaseBit<"IOR", 9, "ior">, - I32BitEnumAttrCaseBit<"EIOR", 10, "eior"> + I32BitEnumAttrCaseBit<"IEOR", 10, "ieor"> ]> { let separator = ", "; let cppNamespace = "::fir"; diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index b96c0360316ad..b217f1433767f 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3518,7 +3518,7 @@ def fir_BoxTotalElementsOp def YieldOp : fir_Op<"yield", [Pure, ReturnLike, Terminator, - ParentOneOf<["LocalitySpecifierOp"]>]> { + ParentOneOf<["LocalitySpecifierOp", "DeclareReductionOp"]>]> { let summary = "loop yield and termination operation"; let description = [{ "fir.yield" yields SSA values from a fir dialect op region and @@ -3662,6 +3662,103 @@ def fir_LocalitySpecifierOp : fir_Op<"local", [IsolatedFromAbove]> { let hasRegionVerifier = 1; } +def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, + Symbol]> { + let summary = "declares a reduction kind"; + let description = [{ + Note: this operation is adapted from omp::DeclareReductionOp. There is a lot + duplication at the moment. TODO Combined both ops into one. See: + https://discourse.llvm.org/t/dialect-for-data-locality-sharing-specifiers-clauses-in-openmp-openacc-and-do-concurrent/86108. + + Declares an `do concurrent` reduction. This requires two mandatory and three + optional regions. + + 1. The optional alloc region specifies how to allocate the thread-local + reduction value. This region should not contain control flow and all + IR should be suitable for inlining straight into an entry block. In + the common case this is expected to contain only allocas. It is + expected to `fir.yield` the allocated value on all control paths. + If allocation is conditional (e.g. only allocate if the mold is + allocated), this should be done in the initilizer region and this + region not included. The alloc region is not used for by-value + reductions (where allocation is implicit). + 2. The initializer region specifies how to initialize the thread-local + reduction value. This is usually the neutral element of the reduction. + For convenience, the region has an argument that contains the value + of the reduction accumulator at the start of the reduction. If an alloc + region is specified, there is a second block argument containing the + address of the allocated memory. The initializer region is expected to + `fir.yield` the new value on all control flow paths. + 3. The reduction region specifies how to combine two values into one, i.e. + the reduction operator. It accepts the two values as arguments and is + expected to `fir.yield` the combined value on all control flow paths. + 4. The atomic reduction region is optional and specifies how two values + can be combined atomically given local accumulator variables. It is + expected to store the combined value in the first accumulator variable. + 5. The cleanup region is optional and specifies how to clean up any memory + allocated by the initializer region. The region has an argument that + contains the value of the thread-local reduction accumulator. This will + be executed after the reduction has completed. + + Note that the MLIR type system does not allow for type-polymorphic + reductions. Separate reduction declarations should be created for different + element and accumulator types. + + For initializer and reduction regions, the operand to `fir.yield` must + match the parent operation's results. + }]; + + let arguments = (ins SymbolNameAttr:$sym_name, + TypeAttr:$type); + + let regions = (region MaxSizedRegion<1>:$allocRegion, + AnyRegion:$initializerRegion, + AnyRegion:$reductionRegion, + AnyRegion:$atomicReductionRegion, + AnyRegion:$cleanupRegion); + + let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword " + "( `alloc` $allocRegion^ )? " + "`init` $initializerRegion " + "`combiner` $reductionRegion " + "( `atomic` $atomicReductionRegion^ )? " + "( `cleanup` $cleanupRegion^ )? "; + + let extraClassDeclaration = [{ + mlir::BlockArgument getAllocMoldArg() { + auto ®ion = getAllocRegion(); + return region.empty() ? nullptr : region.getArgument(0); + } + mlir::BlockArgument getInitializerMoldArg() { + return getInitializerRegion().getArgument(0); + } + mlir::BlockArgument getInitializerAllocArg() { + return getAllocRegion().empty() ? + nullptr : getInitializerRegion().getArgument(1); + } + mlir::BlockArgument getReductionLhsArg() { + return getReductionRegion().getArgument(0); + } + mlir::BlockArgument getReductionRhsArg() { + return getReductionRegion().getArgument(1); + } + mlir::BlockArgument getAtomicReductionLhsArg() { + auto ®ion = getAtomicReductionRegion(); + return region.empty() ? nullptr : region.getArgument(0); + } + mlir::BlockArgument getAtomicReductionRhsArg() { + auto ®ion = getAtomicReductionRegion(); + return region.empty() ? nullptr : region.getArgument(1); + } + mlir::BlockArgument getCleanupAllocArg() { + auto ®ion = getCleanupRegion(); + return region.empty() ? nullptr : region.getArgument(0); + } + }]; + + let hasRegionVerifier = 1; +} + def fir_DoConcurrentOp : fir_Op<"do_concurrent", [SingleBlock, AutomaticAllocationScope]> { let summary = "do concurrent loop wrapper"; @@ -3700,6 +3797,25 @@ def fir_LocalSpecifier { ); } +def fir_ReduceSpecifier { + dag arguments = (ins + Variadic:$reduce_vars, + OptionalAttr:$reduce_byref, + + // This introduces redundency in how reductions are modelled. In particular, + // a single reduction is represented by 2 attributes: + // + // 1. `$reduce_syms` which is a list of `DeclareReductionOp`s. + // 2. `$reduce_attrs` which is an array of `fir::ReduceAttr` values. + // + // The first makes it easier to map `do concurrent` to parallization models + // (e.g. OpenMP and OpenACC) while the second makes it easier to map it to + // nests of `fir.do_loop ... unodered` ops. + OptionalAttr:$reduce_syms, + OptionalAttr:$reduce_attrs + ); +} + def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", [AttrSizedOperandSegments, DeclareOpInterfaceMethods, @@ -3709,7 +3825,7 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", let description = [{ An operation that models a Fortran `do concurrent` loop's header and block. This is a single-region single-block terminator op that is expected to - terminate the region of a `omp.do_concurrent` wrapper op. + terminate the region of a `fir.do_concurrent` wrapper op. This op borrows from both `scf.parallel` and `fir.do_loop` ops. Similar to `scf.parallel`, a loop nest takes 3 groups of SSA values as operands that @@ -3747,8 +3863,6 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", - `lowerBound`: The group of SSA values for the nest's lower bounds. - `upperBound`: The group of SSA values for the nest's upper bounds. - `step`: The group of SSA values for the nest's steps. - - `reduceOperands`: The reduction SSA values, if any. - - `reduceAttrs`: Attributes to store reduction operations, if any. - `loopAnnotation`: Loop metadata to be passed down the compiler pipeline to LLVM. }]; @@ -3757,12 +3871,12 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", Variadic:$lowerBound, Variadic:$upperBound, Variadic:$step, - Variadic:$reduceOperands, - OptionalAttr:$reduceAttrs, OptionalAttr:$loopAnnotation ); - let arguments = !con(opArgs, fir_LocalSpecifier.arguments); + let arguments = !con(opArgs, + fir_LocalSpecifier.arguments, + fir_ReduceSpecifier.arguments); let regions = (region SizedRegion<1>:$region); @@ -3783,12 +3897,18 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop", getNumLocalOperands()); } + mlir::Block::BlockArgListType getRegionReduceArgs() { + return getBody()->getArguments().slice(getNumInductionVars() + + getNumLocalOperands(), + getNumReduceOperands()); + } + /// Number of operands controlling the loop unsigned getNumControlOperands() { return getLowerBound().size() * 3; } // Get Number of reduction operands unsigned getNumReduceOperands() { - return getReduceOperands().size(); + return getReduceVars().size(); } mlir::Operation::operand_range getLocalOperands() { diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 7b640dd497af3..bc2b9f2fcb48a 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -12,6 +12,7 @@ #include "flang/Lower/Bridge.h" +#include "OpenMP/ReductionProcessor.h" #include "flang/Lower/Allocatable.h" #include "flang/Lower/CallInterface.h" #include "flang/Lower/Coarray.h" @@ -127,9 +128,8 @@ struct IncrementLoopInfo { bool isConcurrent; llvm::SmallVector localSymList; llvm::SmallVector localInitSymList; - llvm::SmallVector< - std::pair> - reduceSymList; + llvm::SmallVector reduceSymList; + llvm::SmallVector reduceOperatorList; llvm::SmallVector sharedSymList; mlir::Value loopVariable = nullptr; @@ -1980,7 +1980,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { case Fortran::parser::ReductionOperator::Operator::Ior: return fir::ReduceOperationEnum::IOR; case Fortran::parser::ReductionOperator::Operator::Ieor: - return fir::ReduceOperationEnum::EIOR; + return fir::ReduceOperationEnum::IEOR; } llvm_unreachable("illegal reduction operator"); } @@ -2014,8 +2014,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { std::get(reduceList->t)); for (const Fortran::parser::Name &x : std::get>(reduceList->t)) { - info.reduceSymList.push_back( - std::make_pair(reduce_operation, x.symbol)); + info.reduceSymList.push_back(x.symbol); + info.reduceOperatorList.push_back(reduce_operation); } } } @@ -2076,6 +2076,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { assign.u = Fortran::evaluate::Assignment::BoundsSpec{}; genAssignment(assign); } + for (const Fortran::semantics::Symbol *sym : info.sharedSymList) { const auto *hostDetails = sym->detailsIf(); @@ -2099,6 +2100,46 @@ class FirConverter : public Fortran::lower::AbstractConverter { } } + llvm::SmallVector reduceVarByRef; + llvm::SmallVector reductionDeclSymbols; + llvm::SmallVector nestReduceAttrs; + + for (const auto &reduceOp : info.reduceOperatorList) { + nestReduceAttrs.push_back( + fir::ReduceAttr::get(builder->getContext(), reduceOp)); + } + + llvm::SmallVector reduceVars; + Fortran::lower::omp::ReductionProcessor rp; + rp.processReductionArguments( + toLocation(), *this, info.reduceOperatorList, reduceVars, + reduceVarByRef, reductionDeclSymbols, info.reduceSymList); + + doConcurrentLoopOp.getReduceVarsMutable().assign(reduceVars); + doConcurrentLoopOp.setReduceSymsAttr( + reductionDeclSymbols.empty() + ? nullptr + : mlir::ArrayAttr::get(builder->getContext(), + reductionDeclSymbols)); + doConcurrentLoopOp.setReduceAttrsAttr( + nestReduceAttrs.empty() + ? nullptr + : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs)); + doConcurrentLoopOp.setReduceByrefAttr( + reduceVarByRef.empty() ? nullptr + : mlir::DenseBoolArrayAttr::get( + builder->getContext(), reduceVarByRef)); + + for (auto [sym, reduceVar] : + llvm::zip_equal(info.reduceSymList, reduceVars)) { + auto arg = doConcurrentLoopOp.getRegion().begin()->addArgument( + reduceVar.getType(), doConcurrentLoopOp.getLoc()); + bindSymbol(*sym, hlfir::translateToExtendedValue( + reduceVar.getLoc(), *builder, hlfir::Entity{arg}, + /*contiguousHint=*/true) + .first); + } + // Note that allocatable, types with ultimate components, and type // requiring finalization are forbidden in LOCAL/LOCAL_INIT (F2023 C1130), // so no clean-up needs to be generated for these entities. @@ -2190,6 +2231,12 @@ class FirConverter : public Fortran::lower::AbstractConverter { } } + // Introduce a `do concurrent` scope to bind symbols corresponding to local, + // local_init, and reduce region arguments. + if (!incrementLoopNestInfo.empty() && + incrementLoopNestInfo.back().isConcurrent) + localSymbols.pushScope(); + // Increment loop begin code. (Infinite/while code was already generated.) if (!infiniteLoop && !whileCondition) genFIRIncrementLoopBegin(incrementLoopNestInfo, doStmtEval.dirs); @@ -2213,6 +2260,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { // This call may generate a branch in some contexts. genFIR(endDoEval, unstructuredContext); + + if (!incrementLoopNestInfo.empty() && + incrementLoopNestInfo.back().isConcurrent) + localSymbols.popScope(); } /// Generate FIR to evaluate loop control values (lower, upper and step). @@ -2395,19 +2446,6 @@ class FirConverter : public Fortran::lower::AbstractConverter { info.stepVariable = builder->createTemporary(loc, stepValue.getType()); builder->create(loc, stepValue, info.stepVariable); } - - if (genDoConcurrent && nestReduceOperands.empty()) { - // Create DO CONCURRENT reduce operands and attributes - for (const auto &reduceSym : info.reduceSymList) { - const fir::ReduceOperationEnum reduceOperation = reduceSym.first; - const Fortran::semantics::Symbol *sym = reduceSym.second; - fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr); - nestReduceOperands.push_back(fir::getBase(exv)); - auto reduceAttr = - fir::ReduceAttr::get(builder->getContext(), reduceOperation); - nestReduceAttrs.push_back(reduceAttr); - } - } } for (auto [info, lowerValue, upperValue, stepValue] : @@ -2505,11 +2543,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->setInsertionPointToEnd(loopWrapperOp.getBody()); auto loopOp = builder->create( - loc, nestLBs, nestUBs, nestSts, nestReduceOperands, - nestReduceAttrs.empty() - ? nullptr - : mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs), - nullptr, /*local_vars=*/std::nullopt, /*local_syms=*/nullptr); + loc, nestLBs, nestUBs, nestSts, nullptr, /*local_vars=*/std::nullopt, + /*local_syms=*/nullptr, /*reduce_vars=*/std::nullopt, + /*reduce_byref=*/nullptr, /*reduce_syms=*/nullptr, + /*reduce_attrs=*/nullptr); llvm::SmallVector loopBlockArgTypes( incrementLoopNestInfo.size(), builder->getIndexType()); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 7bea427099a28..5aebfc901e8ac 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -12,6 +12,7 @@ #include "ClauseProcessor.h" #include "Clauses.h" +#include "ReductionProcessor.h" #include "Utils.h" #include "flang/Lower/ConvertExprToHLFIR.h" @@ -25,6 +26,21 @@ namespace Fortran { namespace lower { namespace omp { +using ReductionModifier = + Fortran::lower::omp::clause::Reduction::ReductionModifier; + +mlir::omp::ReductionModifier translateReductionModifier(ReductionModifier mod) { + switch (mod) { + case ReductionModifier::Default: + return mlir::omp::ReductionModifier::defaultmod; + case ReductionModifier::Inscan: + return mlir::omp::ReductionModifier::inscan; + case ReductionModifier::Task: + return mlir::omp::ReductionModifier::task; + } + return mlir::omp::ReductionModifier::defaultmod; +} + /// Check for unsupported map operand types. static void checkMapType(mlir::Location location, mlir::Type type) { if (auto refType = mlir::dyn_cast(type)) @@ -1076,6 +1092,18 @@ bool ClauseProcessor::processIf( }); return found; } + +template +void collectReductionSyms( + const T &reduction, + llvm::SmallVectorImpl &reductionSyms) { + const auto &objectList{std::get(reduction.t)}; + for (const Object &object : objectList) { + const semantics::Symbol *symbol = object.sym(); + reductionSyms.push_back(symbol); + } +} + bool ClauseProcessor::processInReduction( mlir::Location currentLocation, mlir::omp::InReductionClauseOps &result, llvm::SmallVectorImpl &outReductionSyms) const { @@ -1085,10 +1113,14 @@ bool ClauseProcessor::processInReduction( llvm::SmallVector inReduceVarByRef; llvm::SmallVector inReductionDeclSymbols; llvm::SmallVector inReductionSyms; + collectReductionSyms(clause, inReductionSyms); + ReductionProcessor rp; - rp.processReductionArguments( - currentLocation, converter, clause, inReductionVars, - inReduceVarByRef, inReductionDeclSymbols, inReductionSyms); + rp.processReductionArguments( + currentLocation, converter, + std::get(clause.t), + inReductionVars, inReduceVarByRef, inReductionDeclSymbols, + inReductionSyms); // Copy local lists into the output. llvm::copy(inReductionVars, std::back_inserter(result.inReductionVars)); @@ -1416,10 +1448,23 @@ bool ClauseProcessor::processReduction( llvm::SmallVector reduceVarByRef; llvm::SmallVector reductionDeclSymbols; llvm::SmallVector reductionSyms; + collectReductionSyms(clause, reductionSyms); + + auto mod = std::get>(clause.t); + if (mod.has_value()) { + if (mod.value() == ReductionModifier::Task) + TODO(currentLocation, "Reduction modifier `task` is not supported"); + else + result.reductionMod = mlir::omp::ReductionModifierAttr::get( + converter.getFirOpBuilder().getContext(), + translateReductionModifier(mod.value())); + } + ReductionProcessor rp; - rp.processReductionArguments( - currentLocation, converter, clause, reductionVars, reduceVarByRef, - reductionDeclSymbols, reductionSyms, &result.reductionMod); + rp.processReductionArguments( + currentLocation, converter, + std::get(clause.t), + reductionVars, reduceVarByRef, reductionDeclSymbols, reductionSyms); // Copy local lists into the output. llvm::copy(reductionVars, std::back_inserter(result.reductionVars)); llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref)); @@ -1435,21 +1480,25 @@ bool ClauseProcessor::processTaskReduction( return findRepeatableClause( [&](const omp::clause::TaskReduction &clause, const parser::CharBlock &) { llvm::SmallVector taskReductionVars; - llvm::SmallVector TaskReduceVarByRef; - llvm::SmallVector TaskReductionDeclSymbols; - llvm::SmallVector TaskReductionSyms; + llvm::SmallVector taskReduceVarByRef; + llvm::SmallVector taskReductionDeclSymbols; + llvm::SmallVector taskReductionSyms; + collectReductionSyms(clause, taskReductionSyms); + ReductionProcessor rp; - rp.processReductionArguments( - currentLocation, converter, clause, taskReductionVars, - TaskReduceVarByRef, TaskReductionDeclSymbols, TaskReductionSyms); + rp.processReductionArguments( + currentLocation, converter, + std::get(clause.t), + taskReductionVars, taskReduceVarByRef, taskReductionDeclSymbols, + taskReductionSyms); // Copy local lists into the output. llvm::copy(taskReductionVars, std::back_inserter(result.taskReductionVars)); - llvm::copy(TaskReduceVarByRef, + llvm::copy(taskReduceVarByRef, std::back_inserter(result.taskReductionByref)); - llvm::copy(TaskReductionDeclSymbols, + llvm::copy(taskReductionDeclSymbols, std::back_inserter(result.taskReductionSyms)); - llvm::copy(TaskReductionSyms, std::back_inserter(outReductionSyms)); + llvm::copy(taskReductionSyms, std::back_inserter(outReductionSyms)); }); } diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 3d8c4a337a4a4..46b749fb66c86 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -14,7 +14,6 @@ #include "ClauseFinder.h" #include "Clauses.h" -#include "ReductionProcessor.h" #include "Utils.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/Bridge.h" diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h index d7ab21d428e32..7f317f05f67b7 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/lib/Lower/OpenMP/Clauses.h @@ -179,6 +179,7 @@ using IteratorSpecifier = tomp::type::IteratorSpecifierT; using DefinedOperator = tomp::type::DefinedOperatorT; using ProcedureDesignator = tomp::type::ProcedureDesignatorT; using ReductionOperator = tomp::type::ReductionIdentifierT; +using ReductionOperatorList = List; using DependenceType = tomp::type::DependenceType; using Prescriptiveness = tomp::type::Prescriptiveness; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 96daadfaf1b3b..274091329db44 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -17,7 +17,6 @@ #include "Clauses.h" #include "DataSharingProcessor.h" #include "Decomposer.h" -#include "ReductionProcessor.h" #include "Utils.h" #include "flang/Common/idioms.h" #include "flang/Lower/Bridge.h" diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index d7f520e86e532..7cd5bc7b4928c 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -12,6 +12,7 @@ #include "ReductionProcessor.h" +#include "Clauses.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/ConvertType.h" #include "flang/Lower/Support/PrivateReductionUtils.h" @@ -21,8 +22,6 @@ #include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" -#include "flang/Optimizer/Support/FatalError.h" -#include "flang/Parser/tools.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "llvm/Support/CommandLine.h" #include @@ -40,35 +39,35 @@ namespace lower { namespace omp { // explicit template declarations -template void -ReductionProcessor::processReductionArguments( +template void ReductionProcessor::processReductionArguments< + mlir::omp::DeclareReductionOp, omp::clause::ReductionOperatorList>( mlir::Location currentLocation, lower::AbstractConverter &converter, - const omp::clause::Reduction &reduction, + const omp::clause::ReductionOperatorList &redOperatorList, llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod); + const llvm::SmallVectorImpl &reductionSymbols); -template void -ReductionProcessor::processReductionArguments( +template void ReductionProcessor::processReductionArguments< + fir::DeclareReductionOp, llvm::SmallVector>( mlir::Location currentLocation, lower::AbstractConverter &converter, - const omp::clause::TaskReduction &reduction, + const llvm::SmallVector &redOperatorList, llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod); + const llvm::SmallVectorImpl &reductionSymbols); -template void -ReductionProcessor::processReductionArguments( - mlir::Location currentLocation, lower::AbstractConverter &converter, - const omp::clause::InReduction &reduction, - llvm::SmallVectorImpl &reductionVars, - llvm::SmallVectorImpl &reduceVarByRef, - llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod); +template mlir::omp::DeclareReductionOp +ReductionProcessor::createDeclareReduction( + AbstractConverter &converter, llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc, + bool isByRef); + +template fir::DeclareReductionOp +ReductionProcessor::createDeclareReduction( + AbstractConverter &converter, llvm::StringRef reductionOpName, + const ReductionIdentifier redId, mlir::Type type, mlir::Location loc, + bool isByRef); ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( const omp::clause::ProcedureDesignator &pd) { @@ -106,6 +105,37 @@ ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( } } +ReductionProcessor::ReductionIdentifier +ReductionProcessor::getReductionType(const fir::ReduceOperationEnum &redOp) { + switch (redOp) { + case fir::ReduceOperationEnum::Add: + return ReductionIdentifier::ADD; + case fir::ReduceOperationEnum::Multiply: + return ReductionIdentifier::MULTIPLY; + + case fir::ReduceOperationEnum::AND: + return ReductionIdentifier::AND; + case fir::ReduceOperationEnum::OR: + return ReductionIdentifier::OR; + + case fir::ReduceOperationEnum::EQV: + return ReductionIdentifier::EQV; + case fir::ReduceOperationEnum::NEQV: + return ReductionIdentifier::NEQV; + + case fir::ReduceOperationEnum::IAND: + return ReductionIdentifier::IAND; + case fir::ReduceOperationEnum::IEOR: + return ReductionIdentifier::IEOR; + case fir::ReduceOperationEnum::IOR: + return ReductionIdentifier::IOR; + case fir::ReduceOperationEnum::MAX: + return ReductionIdentifier::MAX; + case fir::ReduceOperationEnum::MIN: + return ReductionIdentifier::MIN; + } +} + bool ReductionProcessor::supportedIntrinsicProcReduction( const omp::clause::ProcedureDesignator &pd) { semantics::Symbol *sym = pd.v.sym(); @@ -136,25 +166,26 @@ ReductionProcessor::getReductionName(llvm::StringRef name, return fir::getTypeAsString(ty, kindMap, (name + byrefAddition).str()); } -std::string ReductionProcessor::getReductionName( - omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp, - const fir::KindMapping &kindMap, mlir::Type ty, bool isByRef) { +std::string +ReductionProcessor::getReductionName(ReductionIdentifier redId, + const fir::KindMapping &kindMap, + mlir::Type ty, bool isByRef) { std::string reductionName; - switch (intrinsicOp) { - case omp::clause::DefinedOperator::IntrinsicOperator::Add: + switch (redId) { + case ReductionIdentifier::ADD: reductionName = "add_reduction"; break; - case omp::clause::DefinedOperator::IntrinsicOperator::Multiply: + case ReductionIdentifier::MULTIPLY: reductionName = "multiply_reduction"; break; - case omp::clause::DefinedOperator::IntrinsicOperator::AND: + case ReductionIdentifier::AND: return "and_reduction"; - case omp::clause::DefinedOperator::IntrinsicOperator::EQV: + case ReductionIdentifier::EQV: return "eqv_reduction"; - case omp::clause::DefinedOperator::IntrinsicOperator::OR: + case ReductionIdentifier::OR: return "or_reduction"; - case omp::clause::DefinedOperator::IntrinsicOperator::NEQV: + case ReductionIdentifier::NEQV: return "neqv_reduction"; default: reductionName = "other_reduction"; @@ -330,8 +361,18 @@ mlir::Value ReductionProcessor::createScalarCombiner( return reductionOp; } +template +static void genYield(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value yieldedValue) { + if constexpr (std::is_same_v) + builder.create(loc, yieldedValue); + else + builder.create(loc, yieldedValue); +} + /// Create reduction combiner region for reduction variables which are boxed /// arrays +template static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, ReductionProcessor::ReductionIdentifier redId, fir::BaseBoxType boxTy, mlir::Value lhs, @@ -365,7 +406,7 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value result = ReductionProcessor::createScalarCombiner( builder, loc, redId, eleTy, lhs, rhs); builder.create(loc, result, lhsValAddr); - builder.create(loc, lhsAddr); + genYield(builder, loc, lhsAddr); return; } @@ -404,10 +445,11 @@ static void genBoxCombiner(fir::FirOpBuilder &builder, mlir::Location loc, builder.create(loc, scalarReduction, lhsEleAddr); builder.setInsertionPointAfter(nest.outerOp); - builder.create(loc, lhsAddr); + genYield(builder, loc, lhsAddr); } // generate combiner region for reduction operations +template static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, ReductionProcessor::ReductionIdentifier redId, mlir::Type ty, mlir::Value lhs, mlir::Value rhs, @@ -422,15 +464,15 @@ static void genCombiner(fir::FirOpBuilder &builder, mlir::Location loc, builder, loc, redId, ty, lhsLoaded, rhsLoaded); if (isByRef) { builder.create(loc, result, lhs); - builder.create(loc, lhs); + genYield(builder, loc, lhs); } else { - builder.create(loc, result); + genYield(builder, loc, result); } return; } // all arrays should have been boxed if (auto boxTy = mlir::dyn_cast(ty)) { - genBoxCombiner(builder, loc, redId, boxTy, lhs, rhs); + genBoxCombiner(builder, loc, redId, boxTy, lhs, rhs); return; } @@ -450,15 +492,13 @@ static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) { return ty; } +template static void createReductionAllocAndInitRegions( - AbstractConverter &converter, mlir::Location loc, - mlir::omp::DeclareReductionOp &reductionDecl, + AbstractConverter &converter, mlir::Location loc, OpType &reductionDecl, const ReductionProcessor::ReductionIdentifier redId, mlir::Type type, bool isByRef) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - auto yield = [&](mlir::Value ret) { - builder.create(loc, ret); - }; + auto yield = [&](mlir::Value ret) { genYield(builder, loc, ret); }; mlir::Block *allocBlock = nullptr; mlir::Block *initBlock = nullptr; @@ -508,7 +548,8 @@ static void createReductionAllocAndInitRegions( yield(boxAlloca); } -mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( +template +OpType ReductionProcessor::createDeclareReduction( AbstractConverter &converter, llvm::StringRef reductionOpName, const ReductionIdentifier redId, mlir::Type type, mlir::Location loc, bool isByRef) { @@ -518,8 +559,7 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( assert(!reductionOpName.empty()); - auto decl = - module.lookupSymbol(reductionOpName); + auto decl = module.lookupSymbol(reductionOpName); if (decl) return decl; @@ -528,8 +568,7 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( if (!isByRef) type = valTy; - decl = modBuilder.create(loc, reductionOpName, - type); + decl = modBuilder.create(loc, reductionOpName, type); createReductionAllocAndInitRegions(converter, loc, decl, redId, type, isByRef); @@ -540,7 +579,7 @@ mlir::omp::DeclareReductionOp ReductionProcessor::createDeclareReduction( builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - genCombiner(builder, loc, redId, type, op1, op2, isByRef); + genCombiner(builder, loc, redId, type, op1, op2, isByRef); return decl; } @@ -559,64 +598,41 @@ static bool doReductionByRef(mlir::Value reductionVar) { return false; } -mlir::omp::ReductionModifier translateReductionModifier(ReductionModifier mod) { - switch (mod) { - case ReductionModifier::Default: - return mlir::omp::ReductionModifier::defaultmod; - case ReductionModifier::Inscan: - return mlir::omp::ReductionModifier::inscan; - case ReductionModifier::Task: - return mlir::omp::ReductionModifier::task; - } - return mlir::omp::ReductionModifier::defaultmod; -} - -template +template void ReductionProcessor::processReductionArguments( mlir::Location currentLocation, lower::AbstractConverter &converter, - const T &reduction, llvm::SmallVectorImpl &reductionVars, + const RedOperatorListTy &redOperatorList, + llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - if constexpr (std::is_same_v) { - auto mod = std::get>(reduction.t); - if (mod.has_value()) { - if (mod.value() == ReductionModifier::Task) - TODO(currentLocation, "Reduction modifier `task` is not supported"); - else - *reductionMod = mlir::omp::ReductionModifierAttr::get( - firOpBuilder.getContext(), translateReductionModifier(mod.value())); - } - } - - mlir::omp::DeclareReductionOp decl; - const auto &redOperatorList{ - std::get(reduction.t)}; - assert(redOperatorList.size() == 1 && "Expecting single operator"); - const auto &redOperator = redOperatorList.front(); - const auto &objectList{std::get(reduction.t)}; - - if (!std::holds_alternative(redOperator.u)) { - if (const auto *reductionIntrinsic = - std::get_if(&redOperator.u)) { - if (!ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) { + const llvm::SmallVectorImpl &reductionSymbols) { + if constexpr (std::is_same_v) { + // For OpenMP reduction clauses, check if the reduction operator is + // supported. + assert(redOperatorList.size() == 1 && "Expecting single operator"); + const Fortran::lower::omp::clause::ReductionOperator &redOperator = + redOperatorList.front(); + + if (!std::holds_alternative(redOperator.u)) { + if (const auto *reductionIntrinsic = + std::get_if(&redOperator.u)) { + if (!ReductionProcessor::supportedIntrinsicProcReduction( + *reductionIntrinsic)) { + return; + } + } else { return; } - } else { - return; } } + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + // Reduction variable processing common to both intrinsic operators and // procedure designators fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - for (const Object &object : objectList) { - const semantics::Symbol *symbol = object.sym(); - reductionSymbols.push_back(symbol); + for (const semantics::Symbol *symbol : reductionSymbols) { mlir::Value symVal = converter.getSymbolAddress(*symbol); mlir::Type eleType; auto refType = mlir::dyn_cast_or_null(symVal.getType()); @@ -668,6 +684,7 @@ void ReductionProcessor::processReductionArguments( reduceVarByRef.push_back(doReductionByRef(symVal)); } + unsigned idx = 0; for (auto [symVal, isByRef] : llvm::zip(reductionVars, reduceVarByRef)) { auto redType = mlir::cast(symVal.getType()); const auto &kindMap = firOpBuilder.getKindMap(); @@ -677,47 +694,56 @@ void ReductionProcessor::processReductionArguments( if (mlir::isa(redType.getEleTy())) redNameTy = builder.getI1Type(); - if (const auto &redDefinedOp = - std::get_if(&redOperator.u)) { - const auto &intrinsicOp{ - std::get( - redDefinedOp->u)}; - redId = getReductionType(intrinsicOp); - switch (redId) { - case ReductionIdentifier::ADD: - case ReductionIdentifier::MULTIPLY: - case ReductionIdentifier::AND: - case ReductionIdentifier::EQV: - case ReductionIdentifier::OR: - case ReductionIdentifier::NEQV: - break; - default: - TODO(currentLocation, - "Reduction of some intrinsic operators is not supported"); - break; - } - - reductionName = - getReductionName(intrinsicOp, kindMap, redNameTy, isByRef); - } else if (const auto *reductionIntrinsic = - std::get_if( - &redOperator.u)) { - if (!ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) { - TODO(currentLocation, "Unsupported intrinsic proc reduction"); + if constexpr (std::is_same_v) { + const Fortran::lower::omp::clause::ReductionOperator &redOperator = + redOperatorList.front(); + if (const auto &redDefinedOp = + std::get_if(&redOperator.u)) { + const auto &intrinsicOp{ + std::get( + redDefinedOp->u)}; + redId = getReductionType(intrinsicOp); + switch (redId) { + case ReductionIdentifier::ADD: + case ReductionIdentifier::MULTIPLY: + case ReductionIdentifier::AND: + case ReductionIdentifier::EQV: + case ReductionIdentifier::OR: + case ReductionIdentifier::NEQV: + break; + default: + TODO(currentLocation, + "Reduction of some intrinsic operators is not supported"); + break; + } + + reductionName = getReductionName(redId, kindMap, redNameTy, isByRef); + } else if (const auto *reductionIntrinsic = + std::get_if( + &redOperator.u)) { + if (!ReductionProcessor::supportedIntrinsicProcReduction( + *reductionIntrinsic)) { + TODO(currentLocation, "Unsupported intrinsic proc reduction"); + } + redId = getReductionType(*reductionIntrinsic); + reductionName = + getReductionName(getRealName(*reductionIntrinsic).ToString(), + kindMap, redNameTy, isByRef); + } else { + TODO(currentLocation, "Unexpected reduction type"); } - redId = getReductionType(*reductionIntrinsic); - reductionName = - getReductionName(getRealName(*reductionIntrinsic).ToString(), kindMap, - redNameTy, isByRef); } else { - TODO(currentLocation, "Unexpected reduction type"); + // `do concurrent` reductions + redId = getReductionType(redOperatorList[idx]); + reductionName = getReductionName(redId, kindMap, redNameTy, isByRef); } - decl = createDeclareReduction(converter, reductionName, redId, redType, - currentLocation, isByRef); + OpType decl = createDeclareReduction( + converter, reductionName, redId, redType, currentLocation, isByRef); reductionDeclSymbols.push_back( mlir::SymbolRefAttr::get(firOpBuilder.getContext(), decl.getSymName())); + ++idx; } } diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h index a7198b48f6b4e..95b4b077bdc46 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.h +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h @@ -19,7 +19,6 @@ #include "flang/Parser/parse-tree.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/type.h" -#include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Location.h" #include "mlir/IR/Types.h" @@ -65,6 +64,9 @@ class ReductionProcessor { static ReductionIdentifier getReductionType(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp); + static ReductionIdentifier + getReductionType(const fir::ReduceOperationEnum &pd); + static bool supportedIntrinsicProcReduction(const omp::clause::ProcedureDesignator &pd); @@ -78,10 +80,9 @@ class ReductionProcessor { const fir::KindMapping &kindMap, mlir::Type ty, bool isByRef); - static std::string - getReductionName(omp::clause::DefinedOperator::IntrinsicOperator intrinsicOp, - const fir::KindMapping &kindMap, mlir::Type ty, - bool isByRef); + static std::string getReductionName(ReductionIdentifier redId, + const fir::KindMapping &kindMap, + mlir::Type ty, bool isByRef); /// This function returns the identity value of the operator \p /// reductionOpName. For example: @@ -113,22 +114,23 @@ class ReductionProcessor { /// symbol table. The declaration has a constant initializer with the neutral /// value `initValue`, and the reduction combiner carried over from `reduce`. /// TODO: add atomic region. - static mlir::omp::DeclareReductionOp - createDeclareReduction(AbstractConverter &builder, - llvm::StringRef reductionOpName, - const ReductionIdentifier redId, mlir::Type type, - mlir::Location loc, bool isByRef); + template + static OpType createDeclareReduction(AbstractConverter &builder, + llvm::StringRef reductionOpName, + const ReductionIdentifier redId, + mlir::Type type, mlir::Location loc, + bool isByRef); /// Creates a reduction declaration and associates it with an OpenMP block /// directive. - template + template static void processReductionArguments( mlir::Location currentLocation, lower::AbstractConverter &converter, - const T &reduction, llvm::SmallVectorImpl &reductionVars, + const RedOperatorListTy &redOperatorList, + llvm::SmallVectorImpl &reductionVars, llvm::SmallVectorImpl &reduceVarByRef, llvm::SmallVectorImpl &reductionDeclSymbols, - llvm::SmallVectorImpl &reductionSymbols, - mlir::omp::ReductionModifierAttr *reductionMod = nullptr); + const llvm::SmallVectorImpl &reductionSymbols); }; template diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index c65f51ce6cacd..b9d2574a76ad0 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -668,9 +668,7 @@ void privatizeSymbol( const semantics::Symbol *sym = isDoConcurrent ? &symToPrivatize->GetUltimate() : symToPrivatize; - const lower::SymbolBox hsb = isDoConcurrent - ? converter.shallowLookupSymbol(*sym) - : converter.lookupOneLevelUpSymbol(*sym); + const lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym); assert(hsb && "Host symbol box not found"); mlir::Location symLoc = hsb.getAddr().getLoc(); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index a3de3ae9d116a..b8ba7471d0ad6 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -3294,26 +3294,26 @@ struct LoadOpConversion : public fir::FIROpConversion { } }; -struct LocalitySpecifierOpConversion - : public fir::FIROpConversion { - using FIROpConversion::FIROpConversion; +template +struct DoConcurrentSpecifierOpConversion : public fir::FIROpConversion { + using fir::FIROpConversion::FIROpConversion; llvm::LogicalResult - matchAndRewrite(fir::LocalitySpecifierOp localizer, OpAdaptor adaptor, + matchAndRewrite(OpTy specifier, typename OpTy::Adaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { #ifdef EXPENSIVE_CHECKS auto uses = mlir::SymbolTable::getSymbolUses( - localizer, localizer->getParentOfType()); + specifier, specifier->getParentOfType()); - // `fir.local` ops are not supposed to have any uses at this point (i.e. - // during lowering to LLVM). In case of serialization, the - // `fir.do_concurrent` users are expected to have been lowered to + // `fir.local|fir.declare_reduction` ops are not supposed to have any uses + // at this point (i.e. during lowering to LLVM). In case of serialization, + // the `fir.do_concurrent` users are expected to have been lowered to // `fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent` // users are expected to have been lowered to the target parallel model // (e.g. OpenMP). assert(uses && uses->empty()); #endif - rewriter.eraseOp(localizer); + rewriter.eraseOp(specifier); return mlir::success(); } }; @@ -4268,20 +4268,22 @@ void fir::populateFIRToLLVMConversionPatterns( BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion, CmpcOpConversion, VolatileCastOpConversion, ConvertOpConversion, CoordinateOpConversion, CopyOpConversion, DTEntryOpConversion, - DeclareOpConversion, DivcOpConversion, EmboxOpConversion, - EmboxCharOpConversion, EmboxProcOpConversion, ExtractValueOpConversion, - FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion, - GlobalLenOpConversion, GlobalOpConversion, InsertOnRangeOpConversion, - IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion, - LocalitySpecifierOpConversion, MulcOpConversion, NegcOpConversion, - NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion, - SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion, - ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion, - StoreOpConversion, StringLitOpConversion, SubcOpConversion, - TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion, - UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion, - XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion, - ZeroOpConversion>(converter, options); + DeclareOpConversion, + DoConcurrentSpecifierOpConversion, + DoConcurrentSpecifierOpConversion, + DivcOpConversion, EmboxOpConversion, EmboxCharOpConversion, + EmboxProcOpConversion, ExtractValueOpConversion, FieldIndexOpConversion, + FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion, + GlobalOpConversion, InsertOnRangeOpConversion, IsPresentOpConversion, + LenParamIndexOpConversion, LoadOpConversion, MulcOpConversion, + NegcOpConversion, NoReassocOpConversion, SelectCaseOpConversion, + SelectOpConversion, SelectRankOpConversion, SelectTypeOpConversion, + ShapeOpConversion, ShapeShiftOpConversion, ShiftOpConversion, + SliceOpConversion, StoreOpConversion, StringLitOpConversion, + SubcOpConversion, TypeDescOpConversion, TypeInfoOpConversion, + UnboxCharOpConversion, UnboxProcOpConversion, UndefOpConversion, + UnreachableOpConversion, XArrayCoorOpConversion, XEmboxOpConversion, + XReboxOpConversion, ZeroOpConversion>(converter, options); // Patterns that are populated without a type converter do not trigger // target materializations for the operands of the root op. diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index ecfa2939e96a6..674129f546e94 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -5041,6 +5041,9 @@ void fir::BoxTotalElementsOp::getCanonicalizationPatterns( // LocalitySpecifierOp //===----------------------------------------------------------------------===// +// TODO This is a copy of omp::PrivateClauseOp::verifiyRegions(). Once we find a +// solution to merge both ops into one this duplication will not be needed. See: +// https://discourse.llvm.org/t/dialect-for-data-locality-sharing-specifiers-clauses-in-openmp-openacc-and-do-concurrent/86108. llvm::LogicalResult fir::LocalitySpecifierOp::verifyRegions() { mlir::Type argType = getArgType(); auto verifyTerminator = [&](mlir::Operation *terminator, @@ -5136,6 +5139,84 @@ llvm::LogicalResult fir::LocalitySpecifierOp::verifyRegions() { return llvm::success(); } +// TODO This is a copy of omp::DeclareReductionOp::verifiyRegions(). Once we +// find a solution to merge both ops into one this duplication will not be +// needed. +mlir::LogicalResult fir::DeclareReductionOp::verifyRegions() { + if (!getAllocRegion().empty()) { + for (YieldOp yieldOp : getAllocRegion().getOps()) { + if (yieldOp.getResults().size() != 1 || + yieldOp.getResults().getTypes()[0] != getType()) + return emitOpError() << "expects alloc region to yield a value " + "of the reduction type"; + } + } + + if (getInitializerRegion().empty()) + return emitOpError() << "expects non-empty initializer region"; + mlir::Block &initializerEntryBlock = getInitializerRegion().front(); + + if (initializerEntryBlock.getNumArguments() == 1) { + if (!getAllocRegion().empty()) + return emitOpError() << "expects two arguments to the initializer region " + "when an allocation region is used"; + } else if (initializerEntryBlock.getNumArguments() == 2) { + if (getAllocRegion().empty()) + return emitOpError() << "expects one argument to the initializer region " + "when no allocation region is used"; + } else { + return emitOpError() + << "expects one or two arguments to the initializer region"; + } + + for (mlir::Value arg : initializerEntryBlock.getArguments()) + if (arg.getType() != getType()) + return emitOpError() << "expects initializer region argument to match " + "the reduction type"; + + for (YieldOp yieldOp : getInitializerRegion().getOps()) { + if (yieldOp.getResults().size() != 1 || + yieldOp.getResults().getTypes()[0] != getType()) + return emitOpError() << "expects initializer region to yield a value " + "of the reduction type"; + } + + if (getReductionRegion().empty()) + return emitOpError() << "expects non-empty reduction region"; + mlir::Block &reductionEntryBlock = getReductionRegion().front(); + if (reductionEntryBlock.getNumArguments() != 2 || + reductionEntryBlock.getArgumentTypes()[0] != + reductionEntryBlock.getArgumentTypes()[1] || + reductionEntryBlock.getArgumentTypes()[0] != getType()) + return emitOpError() << "expects reduction region with two arguments of " + "the reduction type"; + for (YieldOp yieldOp : getReductionRegion().getOps()) { + if (yieldOp.getResults().size() != 1 || + yieldOp.getResults().getTypes()[0] != getType()) + return emitOpError() << "expects reduction region to yield a value " + "of the reduction type"; + } + + if (!getAtomicReductionRegion().empty()) { + mlir::Block &atomicReductionEntryBlock = getAtomicReductionRegion().front(); + if (atomicReductionEntryBlock.getNumArguments() != 2 || + atomicReductionEntryBlock.getArgumentTypes()[0] != + atomicReductionEntryBlock.getArgumentTypes()[1]) + return emitOpError() << "expects atomic reduction region with two " + "arguments of the same type"; + } + + if (getCleanupRegion().empty()) + return mlir::success(); + mlir::Block &cleanupEntryBlock = getCleanupRegion().front(); + if (cleanupEntryBlock.getNumArguments() != 1 || + cleanupEntryBlock.getArgument(0).getType() != getType()) + return emitOpError() << "expects cleanup region with one argument " + "of the reduction type"; + + return mlir::success(); +} + //===----------------------------------------------------------------------===// // DoConcurrentOp //===----------------------------------------------------------------------===// @@ -5157,6 +5238,86 @@ llvm::LogicalResult fir::DoConcurrentOp::verify() { // DoConcurrentLoopOp //===----------------------------------------------------------------------===// +static mlir::ParseResult parseSpecifierList( + mlir::OpAsmParser &parser, mlir::OperationState &result, + llvm::StringRef specifierKeyword, llvm::StringRef symsAttrName, + llvm::SmallVectorImpl ®ionArgs, + llvm::SmallVectorImpl ®ionArgTypes, + int32_t &numSpecifierOperands, bool isReduce = false) { + auto &builder = parser.getBuilder(); + llvm::SmallVector specifierOperands; + + if (failed(parser.parseOptionalKeyword(specifierKeyword))) + return mlir::success(); + + std::size_t oldArgTypesSize = regionArgTypes.size(); + if (failed(parser.parseLParen())) + return mlir::failure(); + + llvm::SmallVector spceifierSymbolVec; + llvm::SmallVector attributes; + + if (failed(parser.parseCommaSeparatedList([&]() { + if (failed(parser.parseAttribute(spceifierSymbolVec.emplace_back()))) + return mlir::failure(); + + if (isReduce && + failed(parser.parseAttribute(attributes.emplace_back()))) { + return mlir::failure(); + } + + if (parser.parseOperand(specifierOperands.emplace_back()) || + parser.parseArrow() || + parser.parseArgument(regionArgs.emplace_back())) + return mlir::failure(); + + return mlir::success(); + }))) + return mlir::failure(); + + if (failed(parser.parseColon())) + return mlir::failure(); + + if (failed(parser.parseCommaSeparatedList([&]() { + if (failed(parser.parseType(regionArgTypes.emplace_back()))) + return mlir::failure(); + + return mlir::success(); + }))) + return mlir::failure(); + + if (regionArgs.size() != regionArgTypes.size()) + return parser.emitError(parser.getNameLoc(), "mismatch in number of " + + specifierKeyword.str() + + " arg and types"); + + if (failed(parser.parseRParen())) + return mlir::failure(); + + for (auto operandType : + llvm::zip_equal(specifierOperands, + llvm::drop_begin(regionArgTypes, oldArgTypesSize))) + if (parser.resolveOperand(std::get<0>(operandType), + std::get<1>(operandType), result.operands)) + return mlir::failure(); + + llvm::SmallVector symbolAttrs(spceifierSymbolVec.begin(), + spceifierSymbolVec.end()); + result.addAttribute(symsAttrName, builder.getArrayAttr(symbolAttrs)); + + if (isReduce) { + llvm::SmallVector arrayAttr(attributes.begin(), + attributes.end()); + result.addAttribute( + fir::DoConcurrentLoopOp::getReduceAttrsAttrName(result.name), + builder.getArrayAttr(arrayAttr)); + } + + numSpecifierOperands = specifierOperands.size(); + + return mlir::success(); +} + mlir::ParseResult fir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser, mlir::OperationState &result) { auto &builder = parser.getBuilder(); @@ -5192,90 +5353,26 @@ mlir::ParseResult fir::DoConcurrentLoopOp::parse(mlir::OpAsmParser &parser, parser.resolveOperands(steps, builder.getIndexType(), result.operands)) return mlir::failure(); - llvm::SmallVector reduceOperands; - llvm::SmallVector reduceArgTypes; - if (succeeded(parser.parseOptionalKeyword("reduce"))) { - // Parse reduction attributes and variables. - llvm::SmallVector attributes; - if (failed(parser.parseCommaSeparatedList( - mlir::AsmParser::Delimiter::Paren, [&]() { - if (parser.parseAttribute(attributes.emplace_back()) || - parser.parseArrow() || - parser.parseOperand(reduceOperands.emplace_back()) || - parser.parseColonType(reduceArgTypes.emplace_back())) - return mlir::failure(); - return mlir::success(); - }))) - return mlir::failure(); - // Resolve input operands. - for (auto operand_type : llvm::zip(reduceOperands, reduceArgTypes)) - if (parser.resolveOperand(std::get<0>(operand_type), - std::get<1>(operand_type), result.operands)) - return mlir::failure(); - llvm::SmallVector arrayAttr(attributes.begin(), - attributes.end()); - result.addAttribute(getReduceAttrsAttrName(result.name), - builder.getArrayAttr(arrayAttr)); - } - - llvm::SmallVector localOperands; - if (succeeded(parser.parseOptionalKeyword("local"))) { - std::size_t oldArgTypesSize = argTypes.size(); - if (failed(parser.parseLParen())) - return mlir::failure(); - - llvm::SmallVector localSymbolVec; - if (failed(parser.parseCommaSeparatedList([&]() { - if (failed(parser.parseAttribute(localSymbolVec.emplace_back()))) - return mlir::failure(); - - if (parser.parseOperand(localOperands.emplace_back()) || - parser.parseArrow() || - parser.parseArgument(regionArgs.emplace_back())) - return mlir::failure(); - - return mlir::success(); - }))) - return mlir::failure(); - - if (failed(parser.parseColon())) - return mlir::failure(); - - if (failed(parser.parseCommaSeparatedList([&]() { - if (failed(parser.parseType(argTypes.emplace_back()))) - return mlir::failure(); - - return mlir::success(); - }))) - return mlir::failure(); - - if (regionArgs.size() != argTypes.size()) - return parser.emitError(parser.getNameLoc(), - "mismatch in number of local arg and types"); - - if (failed(parser.parseRParen())) - return mlir::failure(); - - for (auto operandType : llvm::zip_equal( - localOperands, llvm::drop_begin(argTypes, oldArgTypesSize))) - if (parser.resolveOperand(std::get<0>(operandType), - std::get<1>(operandType), result.operands)) - return mlir::failure(); + int32_t numLocalOperands = 0; + if (failed(parseSpecifierList(parser, result, "local", + getLocalSymsAttrName(result.name), regionArgs, + argTypes, numLocalOperands))) + return mlir::failure(); - llvm::SmallVector symbolAttrs(localSymbolVec.begin(), - localSymbolVec.end()); - result.addAttribute(getLocalSymsAttrName(result.name), - builder.getArrayAttr(symbolAttrs)); - } + int32_t numReduceOperands = 0; + if (failed(parseSpecifierList( + parser, result, "reduce", getReduceSymsAttrName(result.name), + regionArgs, argTypes, numReduceOperands, /*isReduce=*/true))) + return mlir::failure(); // Set `operandSegmentSizes` attribute. - result.addAttribute(DoConcurrentLoopOp::getOperandSegmentSizeAttr(), - builder.getDenseI32ArrayAttr( - {static_cast(lower.size()), - static_cast(upper.size()), - static_cast(steps.size()), - static_cast(reduceOperands.size()), - static_cast(localOperands.size())})); + result.addAttribute( + DoConcurrentLoopOp::getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr({static_cast(lower.size()), + static_cast(upper.size()), + static_cast(steps.size()), + static_cast(numLocalOperands), + static_cast(numReduceOperands)})); // Now parse the body. for (auto [arg, type] : llvm::zip_equal(regionArgs, argTypes)) @@ -5297,17 +5394,6 @@ void fir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) { << ") = (" << getLowerBound() << ") to (" << getUpperBound() << ") step (" << getStep() << ")"; - if (!getReduceOperands().empty()) { - p << " reduce("; - auto attrs = getReduceAttrsAttr(); - auto operands = getReduceOperands(); - llvm::interleaveComma(llvm::zip(attrs, operands), p, [&](auto it) { - p << std::get<0>(it) << " -> " << std::get<1>(it) << " : " - << std::get<1>(it).getType(); - }); - p << ')'; - } - if (!getLocalVars().empty()) { p << " local("; llvm::interleaveComma(llvm::zip_equal(getLocalSymsAttr(), getLocalVars(), @@ -5322,13 +5408,30 @@ void fir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) { p << ")"; } + if (!getReduceVars().empty()) { + p << " reduce("; + llvm::interleaveComma( + llvm::zip_equal(getReduceSymsAttr(), getReduceAttrsAttr(), + getReduceVars(), getRegionReduceArgs()), + p, [&](auto it) { + p << std::get<0>(it) << " " << std::get<1>(it) << " " + << std::get<2>(it) << " -> " << std::get<3>(it); + }); + p << " : "; + llvm::interleaveComma(getReduceVars(), p, + [&](auto it) { p << it.getType(); }); + p << ")"; + } + p << ' '; p.printRegion(getRegion(), /*printEntryBlockArgs=*/false); p.printOptionalAttrDict( (*this)->getAttrs(), /*elidedAttrs=*/{DoConcurrentLoopOp::getOperandSegmentSizeAttr(), + DoConcurrentLoopOp::getLocalSymsAttrName(), + DoConcurrentLoopOp::getReduceSymsAttrName(), DoConcurrentLoopOp::getReduceAttrsAttrName(), - DoConcurrentLoopOp::getLocalSymsAttrName()}); + DoConcurrentLoopOp::getReduceByrefAttrName()}); } llvm::SmallVector fir::DoConcurrentLoopOp::getLoopRegions() { @@ -5340,6 +5443,7 @@ llvm::LogicalResult fir::DoConcurrentLoopOp::verify() { mlir::Operation::operand_range ubValues = getUpperBound(); mlir::Operation::operand_range stepValues = getStep(); mlir::Operation::operand_range localVars = getLocalVars(); + mlir::Operation::operand_range reduceVars = getReduceVars(); if (lbValues.empty()) return emitOpError( @@ -5353,7 +5457,8 @@ llvm::LogicalResult fir::DoConcurrentLoopOp::verify() { // Check that the body defines the same number of block arguments as the // number of tuple elements in step. mlir::Block *body = getBody(); - unsigned numIndVarArgs = body->getNumArguments() - localVars.size(); + unsigned numIndVarArgs = + body->getNumArguments() - localVars.size() - reduceVars.size(); if (numIndVarArgs != stepValues.size()) return emitOpError() << "expects the same number of induction variables: " diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp index e440852b3103a..506c8e66dbdfa 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp @@ -234,6 +234,10 @@ class DoConcurrentConversion loop.setLocalSymsAttr(nullptr); } + for (auto [reduceVar, reduceArg] : + llvm::zip_equal(loop.getReduceVars(), loop.getRegionReduceArgs())) + rewriter.replaceAllUsesWith(reduceArg, reduceVar); + // Collect iteration variable(s) allocations so that we can move them // outside the `fir.do_concurrent` wrapper. llvm::SmallVector opsToMove; @@ -257,12 +261,16 @@ class DoConcurrentConversion innermostUnorderdLoop = rewriter.create( doConcurentOp.getLoc(), lb, ub, st, /*unordred=*/true, /*finalCountValue=*/false, - /*iterArgs=*/std::nullopt, loop.getReduceOperands(), + /*iterArgs=*/std::nullopt, loop.getReduceVars(), loop.getReduceAttrsAttr()); ivArgs.push_back(innermostUnorderdLoop.getInductionVar()); rewriter.setInsertionPointToStart(innermostUnorderdLoop.getBody()); } + loop.getRegion().front().eraseArguments(loop.getNumInductionVars() + + loop.getNumLocalOperands(), + loop.getNumReduceOperands()); + rewriter.inlineBlockBefore( &loopBlock, innermostUnorderdLoop.getBody()->getTerminator(), ivArgs); rewriter.eraseOp(doConcurentOp); diff --git a/flang/test/Fir/do_concurrent.fir b/flang/test/Fir/do_concurrent.fir index cc1197ba56bd7..c207ea5dea825 100644 --- a/flang/test/Fir/do_concurrent.fir +++ b/flang/test/Fir/do_concurrent.fir @@ -63,7 +63,7 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index, %j = fir.alloca i32 fir.do_concurrent.loop (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) - reduce(#fir.reduce_attr -> %sum : !fir.ref) { + reduce(@add_reduction_i32 #fir.reduce_attr %sum -> %sum_arg : !fir.ref) { %0 = fir.convert %i_iv : (index) -> i32 fir.store %0 to %i : !fir.ref @@ -83,7 +83,7 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index, // CHECK: %[[I:.*]] = fir.alloca i32 // CHECK: %[[J:.*]] = fir.alloca i32 // CHECK: fir.do_concurrent.loop -// CHECK-SAME: (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) reduce(#fir.reduce_attr -> %[[SUM]] : !fir.ref) { +// CHECK-SAME: (%[[I_IV:.*]], %[[J_IV:.*]]) = (%[[I_LB]], %[[J_LB]]) to (%[[I_UB]], %[[J_UB]]) step (%[[I_ST]], %[[J_ST]]) reduce(@add_reduction_i32 #fir.reduce_attr %[[SUM]] -> %{{.*}} : !fir.ref) { // CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32 // CHECK: fir.store %[[I_IV_CVT]] to %[[I]] : !fir.ref // CHECK: %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32 @@ -161,3 +161,62 @@ func.func @do_concurrent_with_locality_specs() { // CHECK: } // CHECK: return // CHECK: } + +func.func @dc_reduce() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} + %4:2 = hlfir.declare %3 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1 = arith.constant 1 : index + fir.do_concurrent { + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(@add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1 : !fir.ref) { + } + } + return +} + +// CHECK-LABEL: func.func @dc_reduce() { +// CHECK: %[[S_ALLOC:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} +// CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %[[S_ALLOC]] {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: fir.do_concurrent { +// CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) reduce(@add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:.*]] : !fir.ref) { +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +func.func @dc_reduce_2() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} + %4:2 = hlfir.declare %3 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + %5 = fir.alloca i32 {bindc_name = "m", uniq_name = "dc_reduce"} + %6:2 = hlfir.declare %5 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + %c1 = arith.constant 1 : index + + fir.do_concurrent { + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) + reduce(@add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1, + @mul_reduction_i32 #fir.reduce_attr %6#0 -> %arg2 + : !fir.ref, !fir.ref) { + } + } + + return +} + +// CHECK-LABEL: func.func @dc_reduce_2() { +// CHECK: %[[S_ALLOC:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} +// CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %[[S_ALLOC]] {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +// CHECK: %[[M_ALLOC:.*]] = fir.alloca i32 {bindc_name = "m", uniq_name = "dc_reduce"} +// CHECK: %[[M_DECL:.*]]:2 = hlfir.declare %[[M_ALLOC]] {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: fir.do_concurrent { +// CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{[^[:space:]]+}}) +// CHECK-SAME: reduce( +// CHECK-SAME: @add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:[^,]+]], +// CHECK-SAME: @mul_reduction_i32 #fir.reduce_attr %[[M_DECL]]#0 -> %[[M_ARG:[^[:space:]]+]] +// CHECK-SAME: : !fir.ref, !fir.ref) { +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir index aca0ecc1abdc1..e32ea7ad3c729 100644 --- a/flang/test/Fir/invalid.fir +++ b/flang/test/Fir/invalid.fir @@ -1256,8 +1256,8 @@ func.func @dc_invalid_reduction(%arg0: index, %arg1: index) { %sum = fir.alloca i32 // expected-error@+2 {{'fir.do_concurrent.loop' op mismatch in number of reduction variables and reduction attributes}} fir.do_concurrent { - "fir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array}> ({ - ^bb0(%arg3: index): + "fir.do_concurrent.loop"(%arg0, %arg1, %arg0, %sum) <{operandSegmentSizes = array}> ({ + ^bb0(%arg3: index, %sum_arg: i32): %tmp = "fir.alloca"() <{in_type = i32, operandSegmentSizes = array}> : () -> !fir.ref }) : (index, index, index, !fir.ref) -> () } @@ -1266,6 +1266,20 @@ func.func @dc_invalid_reduction(%arg0: index, %arg1: index) { // ----- +func.func @dc_reduce_no_attr() { + %3 = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} + %4:2 = hlfir.declare %3 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c1 = arith.constant 1 : index + // expected-error@+2 {{expected attribute value}} + fir.do_concurrent { + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(@add_reduction_i32 %4#0 -> %arg1 : !fir.ref) { + } + } + return +} + +// ----- + // Should fail when volatility changes from a fir.convert func.func @bad_convert_volatile(%arg0: !fir.ref) -> !fir.ref { // expected-error@+1 {{op this conversion does not preserve volatility}} diff --git a/flang/test/Lower/do_concurrent_reduce.f90 b/flang/test/Lower/do_concurrent_reduce.f90 new file mode 100644 index 0000000000000..8591a21e2b9e0 --- /dev/null +++ b/flang/test/Lower/do_concurrent_reduce.f90 @@ -0,0 +1,41 @@ +! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s + +subroutine do_concurrent_reduce + implicit none + integer :: s, i + + do concurrent (i=1:10) reduce(+:s) + s = s + 1 + end do +end + +! CHECK-LABEL: fir.declare_reduction @add_reduction_i32 : i32 init { +! CHECK: ^bb0(%[[ARG0:.*]]: i32): +! CHECK: %[[VAL_0:.*]] = arith.constant 0 : i32 +! CHECK: fir.yield(%[[VAL_0]] : i32) +! CHECK: } combiner { +! CHECK: ^bb0(%[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: i32): +! CHECK: %[[VAL_3:.*]] = arith.addi %[[VAL_1]], %[[VAL_2]] : i32 +! CHECK: fir.yield(%[[VAL_3]] : i32) +! CHECK: } + +! CHECK-LABEL: func.func @_QPdo_concurrent_reduce() { +! CHECK: %[[S_ALLOC:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "_QFdo_concurrent_reduceEs"} +! CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %[[S_ALLOC]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +! CHECK: fir.do_concurrent { +! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdo_concurrent_reduceEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{[^[:space:]]+}}) +! CHECK-SAME: reduce(@add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:.*]] : !fir.ref) { + +! CHECK: %[[S_ARG_DECL:.*]]:2 = hlfir.declare %[[S_ARG]] {uniq_name = "_QFdo_concurrent_reduceEs"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[S_ARG_VAL:.*]] = fir.load %[[S_ARG_DECL]]#0 : !fir.ref +! CHECK: %[[C1:.*]] = arith.constant 1 : i32 +! CHECK: %[[RED_UPDATE:.*]] = arith.addi %[[S_ARG_VAL]], %[[C1]] : i32 +! CHECK: hlfir.assign %[[RED_UPDATE]] to %[[S_ARG_DECL]]#0 : i32, !fir.ref + +! CHECK: } +! CHECK: } +! CHECK: return +! CHECK: } diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 60df27a591dc3..64f14ff972272 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -1,4 +1,4 @@ -! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s +! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s ! CHECK-LABEL: loop_test subroutine loop_test diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 84db1972cca16..2965b954b49a8 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -12,7 +12,7 @@ subroutine loop_test ! CHECK: %[[VAL_0:.*]] = fir.alloca f32 {bindc_name = "m", uniq_name = "_QFloop_testEm"} ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFloop_testEsum) : !fir.ref - ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(#fir.reduce_attr -> %[[VAL_1:.*]] : !fir.ref, #fir.reduce_attr -> %[[VAL_0:.*]] : !fir.ref) { + ! CHECK: fir.do_concurrent.loop ({{.*}}) = ({{.*}}) to ({{.*}}) step ({{.*}}) reduce(@add_reduction_i32 #fir.reduce_attr %[[VAL_1]] -> %{{.*}}, @other_reduction_f32 #fir.reduce_attr %[[VAL_0]] -> %{{.*}} : {{.*}}) { do concurrent (i=1:5, j=1:5, k=1:5) local(tmp) reduce(+:sum) reduce(max:m) tmp = i + j + k sum = tmp + sum diff --git a/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir b/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir index d9ef36b175598..c550ab8a97d4c 100644 --- a/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir +++ b/flang/test/Transforms/do_concurrent-to-do_loop-unodered.fir @@ -86,7 +86,7 @@ func.func @dc_2d_reduction(%i_lb: index, %i_ub: index, %i_st: index, %j = fir.alloca i32 fir.do_concurrent.loop (%i_iv, %j_iv) = (%i_lb, %j_lb) to (%i_ub, %j_ub) step (%i_st, %j_st) - reduce(#fir.reduce_attr -> %sum : !fir.ref) { + reduce(@add_reduction_i32 #fir.reduce_attr %sum -> %sum_arg : !fir.ref) { %0 = fir.convert %i_iv : (index) -> i32 fir.store %0 to %i : !fir.ref From 3d66aaaa7b8517b23c5eefb9ebd567850f407918 Mon Sep 17 00:00:00 2001 From: ergawy Date: Thu, 26 Jun 2025 23:17:37 -0500 Subject: [PATCH 2/3] review comments --- flang/include/flang/Optimizer/Dialect/FIROps.td | 4 ++-- flang/lib/Lower/Bridge.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index b217f1433767f..41363234e5c6b 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3667,10 +3667,10 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove, let summary = "declares a reduction kind"; let description = [{ Note: this operation is adapted from omp::DeclareReductionOp. There is a lot - duplication at the moment. TODO Combined both ops into one. See: + duplication at the moment. TODO Combine both ops into one. See: https://discourse.llvm.org/t/dialect-for-data-locality-sharing-specifiers-clauses-in-openmp-openacc-and-do-concurrent/86108. - Declares an `do concurrent` reduction. This requires two mandatory and three + Declares a `do concurrent` reduction. This requires two mandatory and three optional regions. 1. The optional alloc region specifies how to allocate the thread-local diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index bc2b9f2fcb48a..2f896d273b580 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2104,10 +2104,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { llvm::SmallVector reductionDeclSymbols; llvm::SmallVector nestReduceAttrs; - for (const auto &reduceOp : info.reduceOperatorList) { + for (const auto &reduceOp : info.reduceOperatorList) nestReduceAttrs.push_back( fir::ReduceAttr::get(builder->getContext(), reduceOp)); - } llvm::SmallVector reduceVars; Fortran::lower::omp::ReductionProcessor rp; @@ -2543,7 +2542,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->setInsertionPointToEnd(loopWrapperOp.getBody()); auto loopOp = builder->create( - loc, nestLBs, nestUBs, nestSts, nullptr, /*local_vars=*/std::nullopt, + loc, nestLBs, nestUBs, nestSts, /*loopAnnotation=*/nullptr, + /*local_vars=*/std::nullopt, /*local_syms=*/nullptr, /*reduce_vars=*/std::nullopt, /*reduce_byref=*/nullptr, /*reduce_syms=*/nullptr, /*reduce_attrs=*/nullptr); From 51e15e603c76a2c0fb558d60d5fff9d0364b14de Mon Sep 17 00:00:00 2001 From: ergawy Date: Fri, 27 Jun 2025 01:54:06 -0500 Subject: [PATCH 3/3] byref printing and parsing --- flang/lib/Optimizer/Dialect/FIROps.cpp | 27 ++++++++++++++++++++------ flang/test/Fir/do_concurrent.fir | 4 ++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 674129f546e94..6b40e7015fdd8 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -5254,17 +5254,21 @@ static mlir::ParseResult parseSpecifierList( if (failed(parser.parseLParen())) return mlir::failure(); + llvm::SmallVector isByRefVec; llvm::SmallVector spceifierSymbolVec; llvm::SmallVector attributes; if (failed(parser.parseCommaSeparatedList([&]() { + if (isReduce) + isByRefVec.push_back( + parser.parseOptionalKeyword("byref").succeeded()); + if (failed(parser.parseAttribute(spceifierSymbolVec.emplace_back()))) return mlir::failure(); if (isReduce && - failed(parser.parseAttribute(attributes.emplace_back()))) { + failed(parser.parseAttribute(attributes.emplace_back()))) return mlir::failure(); - } if (parser.parseOperand(specifierOperands.emplace_back()) || parser.parseArrow() || @@ -5301,6 +5305,13 @@ static mlir::ParseResult parseSpecifierList( std::get<1>(operandType), result.operands)) return mlir::failure(); + if (isReduce) + result.addAttribute( + fir::DoConcurrentLoopOp::getReduceByrefAttrName(result.name), + isByRefVec.empty() + ? nullptr + : mlir::DenseBoolArrayAttr::get(builder.getContext(), isByRefVec)); + llvm::SmallVector symbolAttrs(spceifierSymbolVec.begin(), spceifierSymbolVec.end()); result.addAttribute(symsAttrName, builder.getArrayAttr(symbolAttrs)); @@ -5411,11 +5422,15 @@ void fir::DoConcurrentLoopOp::print(mlir::OpAsmPrinter &p) { if (!getReduceVars().empty()) { p << " reduce("; llvm::interleaveComma( - llvm::zip_equal(getReduceSymsAttr(), getReduceAttrsAttr(), - getReduceVars(), getRegionReduceArgs()), + llvm::zip_equal(getReduceByrefAttr().asArrayRef(), getReduceSymsAttr(), + getReduceAttrsAttr(), getReduceVars(), + getRegionReduceArgs()), p, [&](auto it) { - p << std::get<0>(it) << " " << std::get<1>(it) << " " - << std::get<2>(it) << " -> " << std::get<3>(it); + if (std::get<0>(it)) + p << "byref "; + + p << std::get<1>(it) << " " << std::get<2>(it) << " " + << std::get<3>(it) << " -> " << std::get<4>(it); }); p << " : "; llvm::interleaveComma(getReduceVars(), p, diff --git a/flang/test/Fir/do_concurrent.fir b/flang/test/Fir/do_concurrent.fir index c207ea5dea825..6e2173447855e 100644 --- a/flang/test/Fir/do_concurrent.fir +++ b/flang/test/Fir/do_concurrent.fir @@ -167,7 +167,7 @@ func.func @dc_reduce() { %4:2 = hlfir.declare %3 {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) %c1 = arith.constant 1 : index fir.do_concurrent { - fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(@add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1 : !fir.ref) { + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) reduce(byref @add_reduction_i32 #fir.reduce_attr %4#0 -> %arg1 : !fir.ref) { } } return @@ -177,7 +177,7 @@ func.func @dc_reduce() { // CHECK: %[[S_ALLOC:.*]] = fir.alloca i32 {bindc_name = "s", uniq_name = "dc_reduce"} // CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %[[S_ALLOC]] {uniq_name = "dc_reduce"} : (!fir.ref) -> (!fir.ref, !fir.ref) // CHECK: fir.do_concurrent { -// CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) reduce(@add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:.*]] : !fir.ref) { +// CHECK: fir.do_concurrent.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) reduce(byref @add_reduction_i32 #fir.reduce_attr %[[S_DECL]]#0 -> %[[S_ARG:.*]] : !fir.ref) { // CHECK: } // CHECK: } // CHECK: return