Skip to content

[flang][do concurrent] Re-model reduce to match how reductions are modelled in OpenMP and OpenACC #145837

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion flang/include/flang/Optimizer/Dialect/FIRAttr.td
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def fir_ReduceOperationEnum : I32BitEnumAttr<"ReduceOperationEnum",
I32BitEnumAttrCaseBit<"MIN", 7, "min">,
I32BitEnumAttrCaseBit<"IAND", 8, "iand">,
I32BitEnumAttrCaseBit<"IOR", 9, "ior">,
I32BitEnumAttrCaseBit<"EIOR", 10, "eior">
I32BitEnumAttrCaseBit<"IEOR", 10, "ieor">
]> {
let separator = ", ";
let cppNamespace = "::fir";
Expand Down
136 changes: 128 additions & 8 deletions flang/include/flang/Optimizer/Dialect/FIROps.td
Original file line number Diff line number Diff line change
Expand Up @@ -3518,7 +3518,7 @@ def fir_BoxTotalElementsOp

def YieldOp : fir_Op<"yield",
[Pure, ReturnLike, Terminator,
ParentOneOf<["LocalitySpecifierOp"]>]> {
ParentOneOf<["LocalitySpecifierOp", "DeclareReductionOp"]>]> {
let summary = "loop yield and termination operation";
let description = [{
"fir.yield" yields SSA values from a fir dialect op region and
Expand Down Expand Up @@ -3662,6 +3662,103 @@ def fir_LocalitySpecifierOp : fir_Op<"local", [IsolatedFromAbove]> {
let hasRegionVerifier = 1;
}

def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove,
Symbol]> {
let summary = "declares a reduction kind";
let description = [{
Note: this operation is adapted from omp::DeclareReductionOp. There is a lot
of duplication at the moment. TODO: Combine both ops into one. See:
https://discourse.llvm.org/t/dialect-for-data-locality-sharing-specifiers-clauses-in-openmp-openacc-and-do-concurrent/86108.

Declares a `do concurrent` reduction. This requires two mandatory and three
optional regions.

1. The optional alloc region specifies how to allocate the thread-local
reduction value. This region should not contain control flow and all
IR should be suitable for inlining straight into an entry block. In
the common case this is expected to contain only allocas. It is
expected to `fir.yield` the allocated value on all control paths.
If allocation is conditional (e.g. only allocate if the mold is
allocated), this should be done in the initializer region and this
region not included. The alloc region is not used for by-value
reductions (where allocation is implicit).
2. The initializer region specifies how to initialize the thread-local
reduction value. This is usually the neutral element of the reduction.
For convenience, the region has an argument that contains the value
of the reduction accumulator at the start of the reduction. If an alloc
region is specified, there is a second block argument containing the
address of the allocated memory. The initializer region is expected to
`fir.yield` the new value on all control flow paths.
3. The reduction region specifies how to combine two values into one, i.e.
the reduction operator. It accepts the two values as arguments and is
expected to `fir.yield` the combined value on all control flow paths.
4. The atomic reduction region is optional and specifies how two values
can be combined atomically given local accumulator variables. It is
expected to store the combined value in the first accumulator variable.
5. The cleanup region is optional and specifies how to clean up any memory
allocated by the initializer region. The region has an argument that
contains the value of the thread-local reduction accumulator. This will
be executed after the reduction has completed.

Note that the MLIR type system does not allow for type-polymorphic
reductions. Separate reduction declarations should be created for different
element and accumulator types.

For initializer and reduction regions, the operand to `fir.yield` must
match the parent operation's results.
}];

let arguments = (ins SymbolNameAttr:$sym_name,
TypeAttr:$type);

let regions = (region MaxSizedRegion<1>:$allocRegion,
AnyRegion:$initializerRegion,
AnyRegion:$reductionRegion,
AnyRegion:$atomicReductionRegion,
AnyRegion:$cleanupRegion);

let assemblyFormat = "$sym_name `:` $type attr-dict-with-keyword "
"( `alloc` $allocRegion^ )? "
"`init` $initializerRegion "
"`combiner` $reductionRegion "
"( `atomic` $atomicReductionRegion^ )? "
"( `cleanup` $cleanupRegion^ )? ";

let extraClassDeclaration = [{
mlir::BlockArgument getAllocMoldArg() {
auto &region = getAllocRegion();
return region.empty() ? nullptr : region.getArgument(0);
}
mlir::BlockArgument getInitializerMoldArg() {
return getInitializerRegion().getArgument(0);
}
mlir::BlockArgument getInitializerAllocArg() {
return getAllocRegion().empty() ?
nullptr : getInitializerRegion().getArgument(1);
}
mlir::BlockArgument getReductionLhsArg() {
return getReductionRegion().getArgument(0);
}
mlir::BlockArgument getReductionRhsArg() {
return getReductionRegion().getArgument(1);
}
mlir::BlockArgument getAtomicReductionLhsArg() {
auto &region = getAtomicReductionRegion();
return region.empty() ? nullptr : region.getArgument(0);
}
mlir::BlockArgument getAtomicReductionRhsArg() {
auto &region = getAtomicReductionRegion();
return region.empty() ? nullptr : region.getArgument(1);
}
mlir::BlockArgument getCleanupAllocArg() {
auto &region = getCleanupRegion();
return region.empty() ? nullptr : region.getArgument(0);
}
}];

let hasRegionVerifier = 1;
}

def fir_DoConcurrentOp : fir_Op<"do_concurrent",
[SingleBlock, AutomaticAllocationScope]> {
let summary = "do concurrent loop wrapper";
Expand Down Expand Up @@ -3700,6 +3797,25 @@ def fir_LocalSpecifier {
);
}

// Reduction-related argument group, kept as a standalone record so that it can
// be concatenated into an operation's `arguments` dag with `!con`.
def fir_ReduceSpecifier {
dag arguments = (ins
// The reduction operands.
Variadic<AnyType>:$reduce_vars,
// NOTE(review): presumably one flag per entry of `$reduce_vars`, telling
// whether that reduction is done by reference -- confirm against the verifier.
OptionalAttr<DenseBoolArrayAttr>:$reduce_byref,

// This introduces redundancy in how reductions are modelled. In particular,
// a single reduction is represented by 2 attributes:
//
// 1. `$reduce_syms` which is a list of `DeclareReductionOp`s.
// 2. `$reduce_attrs` which is an array of `fir::ReduceAttr` values.
//
// The first makes it easier to map `do concurrent` to parallelization models
// (e.g. OpenMP and OpenACC) while the second makes it easier to map it to
// nests of `fir.do_loop ... unordered` ops.
OptionalAttr<SymbolRefArrayAttr>:$reduce_syms,
OptionalAttr<ArrayAttr>:$reduce_attrs
);
}

def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
[AttrSizedOperandSegments, DeclareOpInterfaceMethods<LoopLikeOpInterface,
["getLoopInductionVars"]>,
Expand All @@ -3709,7 +3825,7 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
let description = [{
An operation that models a Fortran `do concurrent` loop's header and block.
This is a single-region single-block terminator op that is expected to
terminate the region of a `omp.do_concurrent` wrapper op.
terminate the region of a `fir.do_concurrent` wrapper op.

This op borrows from both `scf.parallel` and `fir.do_loop` ops. Similar to
`scf.parallel`, a loop nest takes 3 groups of SSA values as operands that
Expand Down Expand Up @@ -3747,8 +3863,6 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
- `lowerBound`: The group of SSA values for the nest's lower bounds.
- `upperBound`: The group of SSA values for the nest's upper bounds.
- `step`: The group of SSA values for the nest's steps.
- `reduceOperands`: The reduction SSA values, if any.
- `reduceAttrs`: Attributes to store reduction operations, if any.
- `loopAnnotation`: Loop metadata to be passed down the compiler pipeline to
LLVM.
}];
Expand All @@ -3757,12 +3871,12 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
Variadic<Index>:$lowerBound,
Variadic<Index>:$upperBound,
Variadic<Index>:$step,
Variadic<AnyType>:$reduceOperands,
OptionalAttr<ArrayAttr>:$reduceAttrs,
OptionalAttr<LoopAnnotationAttr>:$loopAnnotation
);

let arguments = !con(opArgs, fir_LocalSpecifier.arguments);
let arguments = !con(opArgs,
fir_LocalSpecifier.arguments,
fir_ReduceSpecifier.arguments);

let regions = (region SizedRegion<1>:$region);

Expand All @@ -3783,12 +3897,18 @@ def fir_DoConcurrentLoopOp : fir_Op<"do_concurrent.loop",
getNumLocalOperands());
}

mlir::Block::BlockArgListType getRegionReduceArgs() {
return getBody()->getArguments().slice(getNumInductionVars()
+ getNumLocalOperands(),
getNumReduceOperands());
}

/// Number of operands controlling the loop
unsigned getNumControlOperands() { return getLowerBound().size() * 3; }

// Get Number of reduction operands
unsigned getNumReduceOperands() {
return getReduceOperands().size();
return getReduceVars().size();
}

mlir::Operation::operand_range getLocalOperands() {
Expand Down
85 changes: 61 additions & 24 deletions flang/lib/Lower/Bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "flang/Lower/Bridge.h"

#include "OpenMP/ReductionProcessor.h"
#include "flang/Lower/Allocatable.h"
#include "flang/Lower/CallInterface.h"
#include "flang/Lower/Coarray.h"
Expand Down Expand Up @@ -127,9 +128,8 @@ struct IncrementLoopInfo {
bool isConcurrent;
llvm::SmallVector<const Fortran::semantics::Symbol *> localSymList;
llvm::SmallVector<const Fortran::semantics::Symbol *> localInitSymList;
llvm::SmallVector<
std::pair<fir::ReduceOperationEnum, const Fortran::semantics::Symbol *>>
reduceSymList;
llvm::SmallVector<const Fortran::semantics::Symbol *> reduceSymList;
llvm::SmallVector<fir::ReduceOperationEnum> reduceOperatorList;
llvm::SmallVector<const Fortran::semantics::Symbol *> sharedSymList;
mlir::Value loopVariable = nullptr;

Expand Down Expand Up @@ -1980,7 +1980,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
case Fortran::parser::ReductionOperator::Operator::Ior:
return fir::ReduceOperationEnum::IOR;
case Fortran::parser::ReductionOperator::Operator::Ieor:
return fir::ReduceOperationEnum::EIOR;
return fir::ReduceOperationEnum::IEOR;
}
llvm_unreachable("illegal reduction operator");
}
Expand Down Expand Up @@ -2014,8 +2014,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
std::get<Fortran::parser::ReductionOperator>(reduceList->t));
for (const Fortran::parser::Name &x :
std::get<std::list<Fortran::parser::Name>>(reduceList->t)) {
info.reduceSymList.push_back(
std::make_pair(reduce_operation, x.symbol));
info.reduceSymList.push_back(x.symbol);
info.reduceOperatorList.push_back(reduce_operation);
}
}
}
Expand Down Expand Up @@ -2076,6 +2076,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
assign.u = Fortran::evaluate::Assignment::BoundsSpec{};
genAssignment(assign);
}

for (const Fortran::semantics::Symbol *sym : info.sharedSymList) {
const auto *hostDetails =
sym->detailsIf<Fortran::semantics::HostAssocDetails>();
Expand All @@ -2099,6 +2100,45 @@ class FirConverter : public Fortran::lower::AbstractConverter {
}
}

llvm::SmallVector<bool> reduceVarByRef;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
llvm::SmallVector<mlir::Attribute> nestReduceAttrs;

for (const auto &reduceOp : info.reduceOperatorList)
nestReduceAttrs.push_back(
fir::ReduceAttr::get(builder->getContext(), reduceOp));

llvm::SmallVector<mlir::Value> reduceVars;
Fortran::lower::omp::ReductionProcessor rp;
rp.processReductionArguments<fir::DeclareReductionOp>(
toLocation(), *this, info.reduceOperatorList, reduceVars,
reduceVarByRef, reductionDeclSymbols, info.reduceSymList);

doConcurrentLoopOp.getReduceVarsMutable().assign(reduceVars);
doConcurrentLoopOp.setReduceSymsAttr(
reductionDeclSymbols.empty()
? nullptr
: mlir::ArrayAttr::get(builder->getContext(),
reductionDeclSymbols));
doConcurrentLoopOp.setReduceAttrsAttr(
nestReduceAttrs.empty()
? nullptr
: mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs));
doConcurrentLoopOp.setReduceByrefAttr(
reduceVarByRef.empty() ? nullptr
: mlir::DenseBoolArrayAttr::get(
builder->getContext(), reduceVarByRef));

for (auto [sym, reduceVar] :
llvm::zip_equal(info.reduceSymList, reduceVars)) {
auto arg = doConcurrentLoopOp.getRegion().begin()->addArgument(
reduceVar.getType(), doConcurrentLoopOp.getLoc());
bindSymbol(*sym, hlfir::translateToExtendedValue(
reduceVar.getLoc(), *builder, hlfir::Entity{arg},
/*contiguousHint=*/true)
.first);
}

// Note that allocatable, types with ultimate components, and type
// requiring finalization are forbidden in LOCAL/LOCAL_INIT (F2023 C1130),
// so no clean-up needs to be generated for these entities.
Expand Down Expand Up @@ -2190,6 +2230,12 @@ class FirConverter : public Fortran::lower::AbstractConverter {
}
}

// Introduce a `do concurrent` scope to bind symbols corresponding to local,
// local_init, and reduce region arguments.
if (!incrementLoopNestInfo.empty() &&
incrementLoopNestInfo.back().isConcurrent)
localSymbols.pushScope();

// Increment loop begin code. (Infinite/while code was already generated.)
if (!infiniteLoop && !whileCondition)
genFIRIncrementLoopBegin(incrementLoopNestInfo, doStmtEval.dirs);
Expand All @@ -2213,6 +2259,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {

// This call may generate a branch in some contexts.
genFIR(endDoEval, unstructuredContext);

if (!incrementLoopNestInfo.empty() &&
incrementLoopNestInfo.back().isConcurrent)
localSymbols.popScope();
}

/// Generate FIR to evaluate loop control values (lower, upper and step).
Expand Down Expand Up @@ -2395,19 +2445,6 @@ class FirConverter : public Fortran::lower::AbstractConverter {
info.stepVariable = builder->createTemporary(loc, stepValue.getType());
builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
}

if (genDoConcurrent && nestReduceOperands.empty()) {
// Create DO CONCURRENT reduce operands and attributes
for (const auto &reduceSym : info.reduceSymList) {
const fir::ReduceOperationEnum reduceOperation = reduceSym.first;
const Fortran::semantics::Symbol *sym = reduceSym.second;
fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
nestReduceOperands.push_back(fir::getBase(exv));
auto reduceAttr =
fir::ReduceAttr::get(builder->getContext(), reduceOperation);
nestReduceAttrs.push_back(reduceAttr);
}
}
}

for (auto [info, lowerValue, upperValue, stepValue] :
Expand Down Expand Up @@ -2505,11 +2542,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {

builder->setInsertionPointToEnd(loopWrapperOp.getBody());
auto loopOp = builder->create<fir::DoConcurrentLoopOp>(
loc, nestLBs, nestUBs, nestSts, nestReduceOperands,
nestReduceAttrs.empty()
? nullptr
: mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs),
nullptr, /*local_vars=*/std::nullopt, /*local_syms=*/nullptr);
loc, nestLBs, nestUBs, nestSts, /*loopAnnotation=*/nullptr,
/*local_vars=*/std::nullopt,
/*local_syms=*/nullptr, /*reduce_vars=*/std::nullopt,
/*reduce_byref=*/nullptr, /*reduce_syms=*/nullptr,
/*reduce_attrs=*/nullptr);

llvm::SmallVector<mlir::Type> loopBlockArgTypes(
incrementLoopNestInfo.size(), builder->getIndexType());
Expand Down
Loading
Loading