#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"
using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
namespace {
/// Pre/post action that brackets a code-gen region with runtime calls:
/// EnterCallee(EnterArgs) on entry and ExitCallee(ExitArgs) on exit.
/// When \p Conditional is true, the region body is only emitted when the
/// enter call returns non-zero (an omp_if.then/omp_if.end diamond).
class NVPTXActionTy final : public PrePostActionTy {
llvm::FunctionCallee EnterCallee = nullptr;
ArrayRef<llvm::Value *> EnterArgs;
llvm::FunctionCallee ExitCallee = nullptr;
ArrayRef<llvm::Value *> ExitArgs;
bool Conditional = false;
// Continuation block created by Enter() in conditional mode; Done() must be
// called to emit it before Exit() in that mode.
llvm::BasicBlock *ContBlock = nullptr;
public:
NVPTXActionTy(llvm::FunctionCallee EnterCallee,
ArrayRef<llvm::Value *> EnterArgs,
llvm::FunctionCallee ExitCallee,
ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
ExitArgs(ExitArgs), Conditional(Conditional) {}
void Enter(CodeGenFunction &CGF) override {
llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
if (Conditional) {
// Branch into the region only when the runtime entry call succeeded.
llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
ContBlock = CGF.createBasicBlock("omp_if.end");
CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
CGF.EmitBlock(ThenBlock);
}
}
// Close the conditional region: jump to and emit the continuation block.
// Only meaningful when constructed with Conditional == true (otherwise
// ContBlock is null).
void Done(CodeGenFunction &CGF) {
CGF.EmitBranch(ContBlock);
CGF.EmitBlock(ContBlock, true);
}
void Exit(CodeGenFunction &CGF) override {
CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
}
};
/// RAII helper that switches the GPU runtime's execution mode (and,
/// optionally, the full-runtime flag) for the lifetime of the object and
/// restores the previous values on destruction.
class ExecutionRuntimeModesRAII {
private:
CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
CGOpenMPRuntimeGPU::EM_Unknown;
CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
bool SavedRuntimeMode = false;
// Non-null only when the second constructor was used; tells the destructor
// whether a runtime-mode flag has to be restored as well.
bool *RuntimeMode = nullptr;
public:
/// Non-SPMD (generic) mode: only the execution mode is switched.
ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
: ExecMode(ExecMode) {
SavedExecMode = ExecMode;
ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
}
/// SPMD mode: switches the execution mode and sets \p RuntimeMode to
/// \p FullRuntimeMode, saving both previous values.
ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
bool &RuntimeMode, bool FullRuntimeMode)
: ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
SavedExecMode = ExecMode;
SavedRuntimeMode = RuntimeMode;
ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
RuntimeMode = FullRuntimeMode;
}
~ExecutionRuntimeModesRAII() {
ExecMode = SavedExecMode;
if (RuntimeMode)
*RuntimeMode = SavedRuntimeMode;
}
};
/// GPU machine parameters used by the globalization code below.
enum MachineConfiguration : unsigned {
// Alignment (in bytes) applied to globalized variables in global memory.
GlobalMemoryAlignment = 128,
// Size of the per-team shared-memory scratch area assumed by this file.
SharedMemorySize = 128,
};
/// Return the canonical declaration named by a privatization clause
/// expression. Array subscripts and OpenMP array sections are peeled off
/// until the underlying DeclRefExpr or MemberExpr is reached.
static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  const Expr *E = RefExpr->IgnoreParens();
  if (const auto *Subscript = dyn_cast<ArraySubscriptExpr>(E)) {
    // Strip nested subscripts: a[i][j] -> a.
    const Expr *Cursor = Subscript->getBase()->IgnoreParenImpCasts();
    while (const auto *Inner = dyn_cast<ArraySubscriptExpr>(Cursor))
      Cursor = Inner->getBase()->IgnoreParenImpCasts();
    E = Cursor;
  } else if (const auto *Section = dyn_cast<OMPArraySectionExpr>(E)) {
    // Strip nested sections first, then any remaining subscripts:
    // a[0:n][1:m] -> a, a[0:n][j] -> a.
    const Expr *Cursor = Section->getBase()->IgnoreParenImpCasts();
    while (const auto *InnerSec = dyn_cast<OMPArraySectionExpr>(Cursor))
      Cursor = InnerSec->getBase()->IgnoreParenImpCasts();
    while (const auto *InnerSub = dyn_cast<ArraySubscriptExpr>(Cursor))
      Cursor = InnerSub->getBase()->IgnoreParenImpCasts();
    E = Cursor;
  }
  E = E->IgnoreParenImpCasts();
  if (const auto *DRE = dyn_cast<DeclRefExpr>(E))
    return cast<ValueDecl>(DRE->getDecl()->getCanonicalDecl());
  // The only remaining valid form is a member access.
  return cast<ValueDecl>(
      cast<MemberExpr>(E)->getMemberDecl()->getCanonicalDecl());
}
/// Build an implicit record type "_globalized_locals_ty" with one field per
/// escaped variable so those variables can live in (shared/global) memory
/// instead of the stack.
/// \param EscapedDecls variables escaping a parallel region; they get an
///        array field of \p BufSize elements with GlobalMemoryAlignment.
/// \param EscapedDeclsForTeams variables escaping a teams region; they get a
///        single field with their natural alignment.
/// \param MappedDeclsFields out-map from each variable to its created field.
/// \returns the completed record, or nullptr if there is nothing to globalize.
static RecordDecl *buildRecordForGlobalizedVars(
ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&MappedDeclsFields, int BufSize) {
using VarsDataTy = std::pair<CharUnits , const ValueDecl *>;
if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
return nullptr;
SmallVector<VarsDataTy, 4> GlobalizedVars;
// Parallel-region variables are padded up to at least GlobalMemoryAlignment.
for (const ValueDecl *D : EscapedDecls)
GlobalizedVars.emplace_back(
CharUnits::fromQuantity(std::max(
C.getDeclAlign(D).getQuantity(),
static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
D);
for (const ValueDecl *D : EscapedDeclsForTeams)
GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
// Lay out the most-aligned variables first; stable to keep declaration
// order among equally aligned variables.
llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
return L.first > R.first;
});
RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
GlobalizedRD->startDefinition();
llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
for (const auto &Pair : GlobalizedVars) {
const ValueDecl *VD = Pair.second;
QualType Type = VD->getType();
// References are globalized as pointers to the referenced type.
if (Type->isLValueReferenceType())
Type = C.getPointerType(Type.getNonReferenceType());
else
Type = Type.getNonReferenceType();
SourceLocation Loc = VD->getLocation();
FieldDecl *Field;
if (SingleEscaped.count(VD)) {
// Teams-level variable: one slot, carrying over any aligned attributes.
Field = FieldDecl::Create(
C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
nullptr, false,
ICIS_NoInit);
Field->setAccess(AS_public);
if (VD->hasAttrs()) {
for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
E(VD->getAttrs().end());
I != E; ++I)
Field->addAttr(*I);
}
} else {
// Parallel-level variable: an array of BufSize slots with an explicit
// GNU aligned attribute of at least GlobalMemoryAlignment.
llvm::APInt ArraySize(32, BufSize);
Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
0);
Field = FieldDecl::Create(
C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
nullptr, false,
ICIS_NoInit);
Field->setAccess(AS_public);
llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
static_cast<CharUnits::QuantityType>(
GlobalMemoryAlignment)));
Field->addAttr(AlignedAttr::CreateImplicit(
C, true,
IntegerLiteral::Create(C, Align,
C.getIntTypeForBitwidth(32, 0),
SourceLocation()),
{}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
}
GlobalizedRD->addDecl(Field);
MappedDeclsFields.try_emplace(VD, Field);
}
GlobalizedRD->completeDefinition();
return GlobalizedRD;
}
/// Statement visitor that collects variables which escape their declaration
/// context (via reference captures, address-of, array decay, etc.) and must
/// therefore be globalized for GPU execution. After visiting, the results
/// are exposed through getEscapedDecls() / getEscapedParameters() /
/// getEscapedVariableLengthDecls() and an implicit record built on demand by
/// getGlobalizedRecord().
class CheckVarsEscapingDeclContext final
: public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
CodeGenFunction &CGF;
// Escaped variables with a statically known size.
llvm::SetVector<const ValueDecl *> EscapedDecls;
// Escaped variables with a variably modified type (VLAs etc.).
llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
// Captured parameters that escape by value (non-reference field captures).
llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
// Lazily built record of all globalized variables; see
// buildRecordForGlobalizedVars().
RecordDecl *GlobalizedRD = nullptr;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
// When true, any DeclRefExpr visited marks its decl as escaped (set while
// walking address-of / array-decay subexpressions and lvalue call args).
bool AllEscaped = false;
// True while analyzing captures of a combined distribute+parallel region,
// which relaxes some of the capture-based early exits below.
bool IsForCombinedParallelRegion = false;
void markAsEscaped(const ValueDecl *VD) {
// Only local VarDecls can be globalized; declare-target decls already
// have a global home.
if (!isa<VarDecl>(VD) ||
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
return;
VD = cast<ValueDecl>(VD->getCanonicalDecl());
// Variables with an explicit allocator are handled elsewhere.
if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
return;
if (auto *CSI = CGF.CapturedStmtInfo) {
if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
if (!IsForCombinedParallelRegion) {
// Outside combined regions, only map/private captures (and pointer
// maps) are interesting; everything else cannot escape here.
if (!FD->hasAttrs())
return;
const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
if (!Attr)
return;
if (((Attr->getCaptureKind() != OMPC_map) &&
!isOpenMPPrivate(Attr->getCaptureKind())) ||
((Attr->getCaptureKind() == OMPC_map) &&
!FD->getType()->isAnyPointerType()))
return;
}
if (!FD->getType()->isReferenceType()) {
assert(!VD->getType()->isVariablyModifiedType() &&
"Parameter captured by value with variably modified type");
EscapedParameters.insert(VD);
} else if (!IsForCombinedParallelRegion) {
return;
}
}
}
// References not captured above do not need their own globalized storage.
if ((!CGF.CapturedStmtInfo ||
(IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
VD->getType()->isReferenceType())
return;
if (VD->getType()->isVariablyModifiedType())
EscapedVariableLengthDecls.insert(VD);
else
EscapedDecls.insert(VD);
}
// Mark reference-typed decls as escaped and scan their initializers.
void VisitValueDecl(const ValueDecl *VD) {
if (VD->getType()->isLValueReferenceType())
markAsEscaped(VD);
if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
const bool SavedAllEscaped = AllEscaped;
// A reference initializer makes everything it refers to escape.
AllEscaped = VD->getType()->isLValueReferenceType();
Visit(VarD->getInit());
AllEscaped = SavedAllEscaped;
}
}
}
// Process the captures of an OpenMP captured statement. For combined
// parallel regions, a by-reference capture only counts as "combined" if
// the variable appears in a firstprivate/lastprivate clause.
void VisitOpenMPCapturedStmt(const CapturedStmt *S,
ArrayRef<OMPClause *> Clauses,
bool IsCombinedParallelRegion) {
if (!S)
return;
for (const CapturedStmt::Capture &C : S->captures()) {
if (C.capturesVariable() && !C.capturesVariableByCopy()) {
const ValueDecl *VD = C.getCapturedVar();
bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
if (IsCombinedParallelRegion) {
IsForCombinedParallelRegion = false;
for (const OMPClause *C : Clauses) {
// Skip non-privatizing clauses plus reduction/linear/private,
// which get their own storage handling.
if (!isOpenMPPrivate(C->getClauseKind()) ||
C->getClauseKind() == OMPC_reduction ||
C->getClauseKind() == OMPC_linear ||
C->getClauseKind() == OMPC_private)
continue;
ArrayRef<const Expr *> Vars;
if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
Vars = PC->getVarRefs();
else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
Vars = PC->getVarRefs();
else
llvm_unreachable("Unexpected clause.");
for (const auto *E : Vars) {
const Decl *D =
cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
if (D == VD->getCanonicalDecl()) {
IsForCombinedParallelRegion = true;
break;
}
}
if (IsForCombinedParallelRegion)
break;
}
}
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
}
}
}
// Build the globalized-locals record from the decls collected so far.
// In a target/teams/distribute (TTD) region the decls go into the teams
// bucket; otherwise into the parallel bucket sized by the warp size.
void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
assert(!GlobalizedRD &&
"Record for globalized variables is built already.");
ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
if (IsInTTDRegion)
EscapedDeclsForTeams = EscapedDecls.getArrayRef();
else
EscapedDeclsForParallel = EscapedDecls.getArrayRef();
GlobalizedRD = ::buildRecordForGlobalizedVars(
CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
MappedDeclsFields, WarpSize);
}
public:
CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
ArrayRef<const ValueDecl *> TeamsReductions)
: CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
}
virtual ~CheckVarsEscapingDeclContext() = default;
void VisitDeclStmt(const DeclStmt *S) {
if (!S)
return;
for (const Decl *D : S->decls())
if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
VisitValueDecl(VD);
}
void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
if (!D)
return;
if (!D->hasAssociatedStmt())
return;
if (const auto *S =
dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
// Directives without a real capture region are scanned transparently.
llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
VisitStmt(S->getCapturedStmt());
return;
}
VisitOpenMPCapturedStmt(
S, D->clauses(),
CaptureRegions.back() == OMPD_parallel &&
isOpenMPDistributeDirective(D->getDirectiveKind()));
}
}
void VisitCapturedStmt(const CapturedStmt *S) {
if (!S)
return;
for (const CapturedStmt::Capture &C : S->captures()) {
if (C.capturesVariable() && !C.capturesVariableByCopy()) {
const ValueDecl *VD = C.getCapturedVar();
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
}
}
}
void VisitLambdaExpr(const LambdaExpr *E) {
if (!E)
return;
for (const LambdaCapture &C : E->captures()) {
if (C.capturesVariable()) {
// Only by-reference lambda captures make a variable escape.
if (C.getCaptureKind() == LCK_ByRef) {
const ValueDecl *VD = C.getCapturedVar();
markAsEscaped(VD);
if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
}
}
}
}
void VisitBlockExpr(const BlockExpr *E) {
if (!E)
return;
for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
if (C.isByRef()) {
const VarDecl *VD = C.getVariable();
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
VisitValueDecl(VD);
}
}
}
void VisitCallExpr(const CallExpr *E) {
if (!E)
return;
for (const Expr *Arg : E->arguments()) {
if (!Arg)
continue;
if (Arg->isLValue()) {
// An lvalue argument may be bound to a reference parameter, so
// everything referenced inside it escapes.
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(Arg);
AllEscaped = SavedAllEscaped;
} else {
Visit(Arg);
}
}
Visit(E->getCallee());
}
void VisitDeclRefExpr(const DeclRefExpr *E) {
if (!E)
return;
const ValueDecl *VD = E->getDecl();
if (AllEscaped)
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
else if (const auto *VarD = dyn_cast<VarDecl>(VD))
if (VarD->isInitCapture())
VisitValueDecl(VD);
}
void VisitUnaryOperator(const UnaryOperator *E) {
if (!E)
return;
if (E->getOpcode() == UO_AddrOf) {
// Taking an address makes the operand escape.
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(E->getSubExpr());
AllEscaped = SavedAllEscaped;
} else {
Visit(E->getSubExpr());
}
}
void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
if (!E)
return;
if (E->getCastKind() == CK_ArrayToPointerDecay) {
// Array-to-pointer decay exposes the array's address.
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(E->getSubExpr());
AllEscaped = SavedAllEscaped;
} else {
Visit(E->getSubExpr());
}
}
void VisitExpr(const Expr *E) {
if (!E)
return;
// Only lvalue expressions propagate the AllEscaped state to children.
bool SavedAllEscaped = AllEscaped;
if (!E->isLValue())
AllEscaped = false;
for (const Stmt *Child : E->children())
if (Child)
Visit(Child);
AllEscaped = SavedAllEscaped;
}
void VisitStmt(const Stmt *S) {
if (!S)
return;
for (const Stmt *Child : S->children())
if (Child)
Visit(Child);
}
/// Returns (building it on first use) the record of globalized variables.
const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
if (!GlobalizedRD)
buildRecordForGlobalizedVars(IsInTTDRegion);
return GlobalizedRD;
}
/// Field of the globalized record backing \p VD, or nullptr if \p VD was
/// not globalized.
const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
assert(GlobalizedRD &&
"Record for globalized variables must be generated already.");
auto I = MappedDeclsFields.find(VD);
if (I == MappedDeclsFields.end())
return nullptr;
return I->getSecond();
}
ArrayRef<const ValueDecl *> getEscapedDecls() const {
return EscapedDecls.getArrayRef();
}
const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
return EscapedParameters;
}
ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
return EscapedVariableLengthDecls.getArrayRef();
}
};
}
/// Emit IR computing the warp id of the current thread:
/// warp_id = thread_id >> log2(warp_size).
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  unsigned WarpSizeLog2 =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  return CGF.Builder.CreateAShr(RT.getGPUThreadID(CGF), WarpSizeLog2,
                                "nvptx_warp_id");
}
/// Emit IR computing the lane id of the current thread within its warp:
/// lane_id = thread_id & (warp_size - 1), via a mask of the low
/// log2(warp_size) bits.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  unsigned Bits = llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  unsigned Mask = ~0u >> (32u - Bits);
  return CGF.Builder.CreateAnd(RT.getGPUThreadID(CGF),
                               CGF.Builder.getInt32(Mask), "nvptx_lane_id");
}
/// Current kernel execution mode (SPMD / non-SPMD / unknown), as set by the
/// ExecutionRuntimeModesRAII helpers during kernel emission.
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
}
/// Select the data-sharing scheme: -fopenmp-cuda-mode picks the CUDA
/// scheme, otherwise the generic globalization scheme is used.
static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  if (CGM.getLangOpts().OpenMPCUDAMode)
    return CGOpenMPRuntimeGPU::CUDA;
  return CGOpenMPRuntimeGPU::Generic;
}
/// Check if the directive \p D (a target or target-teams directive) has a
/// perfectly nested parallel directive, which allows the kernel to be
/// emitted in SPMD mode.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
const OMPExecutableDirective &D) {
const auto *CS = D.getInnermostCapturedStmt();
const auto *Body =
CS->getCapturedStmt()->IgnoreContainers(true);
const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NestedDir =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
switch (D.getDirectiveKind()) {
case OMPD_target:
// 'target' with nested 'parallel', or 'target'->'teams'->'parallel'.
if (isOpenMPParallelDirective(DKind))
return true;
if (DKind == OMPD_teams) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPParallelDirective(DKind))
return true;
}
}
return false;
case OMPD_target_teams:
return isOpenMPParallelDirective(DKind);
// None of the remaining directive kinds can legally enclose the nested
// directive here; the outer kind is restricted by the callers.
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
llvm_unreachable("Unexpected directive.");
}
}
return false;
}
/// Check if the target directive \p D can be emitted in SPMD execution mode
/// (all threads active from kernel start). Combined parallel forms always
/// can; plain target/target-teams depend on what is nested inside.
static bool supportsSPMDExecutionMode(ASTContext &Ctx,
const OMPExecutableDirective &D) {
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
switch (DirectiveKind) {
case OMPD_target:
case OMPD_target_teams:
return hasNestedSPMDDirective(Ctx, D);
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_target_simd:
case OMPD_target_teams_distribute_simd:
return true;
case OMPD_target_teams_distribute:
return false;
// All remaining kinds are not target directives and must not reach here.
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
break;
}
llvm_unreachable(
"Unknown programming model for OpenMP directive on NVPTX target.");
}
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
isOpenMPLoopDirective(D.getDirectiveKind()) &&
"Expected loop-based directive.");
return !D.hasClausesOfKind<OMPOrderedClause>() &&
(!D.hasClausesOfKind<OMPScheduleClause>() ||
llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
[](const OMPScheduleClause *C) {
return C->getScheduleKind() == OMPC_SCHEDULE_static;
}));
}
/// Check if the SPMD-mode directive \p D has a nested directive structure
/// (parallel/teams chains ending in a statically scheduled worksharing loop
/// or simd) that permits running with the lightweight runtime.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
const OMPExecutableDirective &D) {
assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
const auto *CS = D.getInnermostCapturedStmt();
const auto *Body =
CS->getCapturedStmt()->IgnoreContainers(true);
const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NestedDir =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
switch (D.getDirectiveKind()) {
case OMPD_target:
// 'target' with a nested combined parallel worksharing loop using
// static scheduling, a simd-only directive, or a deeper
// parallel/teams chain reaching such a loop.
if (isOpenMPParallelDirective(DKind) &&
isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
hasStaticScheduling(*NestedDir))
return true;
if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
return true;
if (DKind == OMPD_parallel) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
}
} else if (DKind == OMPD_teams) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPParallelDirective(DKind) &&
isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
if (DKind == OMPD_parallel) {
// target -> teams -> parallel -> worksharing loop.
Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
}
}
}
}
return false;
case OMPD_target_teams:
// Same patterns as OMPD_target, minus the extra teams level.
if (isOpenMPParallelDirective(DKind) &&
isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
hasStaticScheduling(*NestedDir))
return true;
if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
return true;
if (DKind == OMPD_parallel) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
}
}
return false;
case OMPD_target_parallel:
if (DKind == OMPD_simd)
return true;
return isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
// The remaining outer kinds are handled by the caller directly (or are
// not target directives) and must not reach this nested analysis.
case OMPD_target_teams_distribute:
case OMPD_target_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
llvm_unreachable("Unexpected directive.");
}
}
return false;
}
/// Check if the target directive \p D can run with the lightweight (SPMD,
/// reduced) device runtime: it must support SPMD mode and either be a
/// simd-only form, or use (possibly nested) static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
const OMPExecutableDirective &D) {
if (!supportsSPMDExecutionMode(Ctx, D))
return false;
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
switch (DirectiveKind) {
case OMPD_target:
case OMPD_target_teams:
case OMPD_target_parallel:
return hasNestedLightweightDirective(Ctx, D);
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
// Combined loop forms: decided by the directive's own schedule.
return hasStaticScheduling(D);
case OMPD_target_simd:
case OMPD_target_teams_distribute_simd:
return true;
case OMPD_target_teams_distribute:
return false;
// All remaining kinds are not target directives and must not reach here.
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
break;
}
llvm_unreachable(
"Unknown programming model for OpenMP directive on NVPTX target.");
}
/// Emit a target region kernel in non-SPMD (generic) execution mode.
/// Switches the execution mode for the duration of emission and wraps the
/// outlined body with generic-mode kernel init/deinit calls.
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
EntryFunctionState EST;
// Parallel-region wrappers are per-kernel; start fresh for this kernel.
WrapperFunctionsMap.clear();
// Action emitting the generic-mode prolog/epilog around the kernel body.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
: EST(EST) {}
void Enter(CodeGenFunction &CGF) override {
auto &RT =
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
RT.emitKernelInit(CGF, EST, false);
RT.setLocThreadIdInsertPt(CGF, true);
}
void Exit(CodeGenFunction &CGF) override {
auto &RT =
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, false);
}
} Action(EST);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
IsInTTDRegion = false;
}
/// Emit the kernel entry sequence via OpenMPIRBuilder::createTargetInit.
/// In generic (non-SPMD) mode this also emits the globalized-variables
/// prolog for the master thread region.
void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
EntryFunctionState &EST, bool IsSPMD) {
CGBuilderTy &Bld = CGF.Builder;
Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
IsInTargetMasterThreadRegion = IsSPMD;
if (!IsSPMD)
emitGenericVarsProlog(CGF, EST.Loc);
}
/// Emit the kernel exit sequence; mirrors emitKernelInit: the generic-mode
/// globalization epilog runs before OpenMPIRBuilder::createTargetDeinit.
void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
EntryFunctionState &EST,
bool IsSPMD) {
if (!IsSPMD)
emitGenericVarsEpilog(CGF);
CGBuilderTy &Bld = CGF.Builder;
OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
}
/// Emit a target region kernel in SPMD execution mode. The full runtime is
/// required when forced by -fopenmp-cuda-force-full-runtime or when the
/// directive does not qualify for the lightweight runtime.
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
ExecutionRuntimeModesRAII ModeRAII(
CurrentExecutionMode, RequiresFullRuntime,
CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
!supportsLightweightRuntime(CGM.getContext(), D));
EntryFunctionState EST;
// Action emitting the SPMD-mode prolog/epilog around the kernel body.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU &RT;
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
CGOpenMPRuntimeGPU::EntryFunctionState &EST)
: RT(RT), EST(EST) {}
void Enter(CodeGenFunction &CGF) override {
RT.emitKernelInit(CGF, EST, true);
RT.setLocThreadIdInsertPt(CGF, true);
}
void Exit(CodeGenFunction &CGF) override {
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, true);
}
} Action(*this, EST);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
IsInTTDRegion = false;
}
/// Record the kernel's execution mode (SPMD vs. generic) in a weak constant
/// i8 global named "<kernel>_exec_mode" that the device runtime can read.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  llvm::Constant *ModeVal = llvm::ConstantInt::get(
      CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage, ModeVal, Twine(Name, "_exec_mode"));
  // Keep the global alive even though nothing in the module references it.
  CGM.addCompilerUsedGlobal(GV);
}
/// Mark the offload entry function as a GPU kernel: add the NVVM
/// "nvvm.annotations" kernel metadata and a "kernel" function attribute.
/// Non-function entries (e.g. declare-target globals) are ignored.
void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
llvm::Constant *Addr,
uint64_t Size, int32_t,
llvm::GlobalValue::LinkageTypes) {
llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
if (!Fn)
return;
llvm::Module &M = CGM.getModule();
llvm::LLVMContext &Ctx = CGM.getLLVMContext();
// !{ptr @fn, !"kernel", i32 1} is the NVVM convention for kernel entry
// points.
llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
llvm::Metadata *MDVals[] = {
llvm::ConstantAsMetadata::get(Fn), llvm::MDString::get(Ctx, "kernel"),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
Fn->addFnAttr(llvm::Attribute::get(Ctx, "kernel"));
}
/// Outline the target region \p D as a GPU kernel, choosing SPMD or generic
/// mode from the directive structure, and publish the chosen mode in the
/// "<kernel>_exec_mode" global.
void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
if (!IsOffloadEntry) return;
assert(!ParentName.empty() && "Invalid target region parent name!");
bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
if (Mode)
emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
else
emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Bit flags stored in the reserved_2 field of the ident_t struct to tell
/// the device runtime which execution/runtime mode the region uses.
enum ModeFlagsTy : unsigned {
/// Bit set if the region executes in SPMD mode.
KMP_IDENT_SPMD_MODE = 0x01,
/// Bit set if the region runs with the simplified (lightweight) runtime.
KMP_IDENT_SIMPLE_RT_MODE = 0x02,
LLVM_MARK_AS_BITMASK_ENUM(KMP_IDENT_SIMPLE_RT_MODE)
};
/// Flag combination for an undefined mode: non-SPMD + simple runtime.
static const ModeFlagsTy UndefinedMode =
(~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
}
/// Compute the mode flags placed in the reserved slot of the default source
/// location (ident_t) for the current execution mode.
unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
  switch (getExecutionMode()) {
  case EM_SPMD:
    // SPMD always advertises the SPMD bit; the simple-runtime bit is only
    // set when the full runtime is not required.
    return requiresFullRuntime()
               ? (KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE))
               : (KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE);
  case EM_NonSPMD:
    // Generic mode always runs with the full runtime: both bits clear.
    assert(requiresFullRuntime() && "Expected full runtime.");
    return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
  case EM_Unknown:
    return UndefinedMode;
  }
  llvm_unreachable("Unknown flags are requested.");
}
/// GPU runtime constructor. Only valid when compiling device code; also emits
/// the __omp_rtl_* configuration globals the device runtime reads, unless the
/// GPU library is disabled or no host IR file is provided.
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP can only handle device code.");

  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
  // Skip emitting the runtime-configuration globals when there is no device
  // runtime to consume them.
  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
    return;

  // Each flag mirrors a -fopenmp-* language option into a module-level global
  // consumed by the device runtime.
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                              "__omp_rtl_debug_kind");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
                              "__omp_rtl_assume_teams_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                              "__omp_rtl_assume_threads_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
                              "__omp_rtl_assume_no_thread_state");
}
/// Emit the proc_bind clause. In SPMD mode the clause is dropped entirely;
/// otherwise lowering is delegated to the common base implementation.
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}
/// Intentionally a no-op on the device: the num_threads value is forwarded
/// directly as an argument of the __kmpc_parallel_51 call (see
/// emitParallelCall), so no separate runtime call is needed here.
void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
}
/// Intentionally a no-op on the device: the number of teams is fixed by the
/// kernel launch, so there is nothing to emit inside device code.
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}
/// Outline a parallel region for the GPU. Tracks (and restores) the
/// in-parallel / in-TTD / in-target-master-thread state flags around the base
/// outlining, and in generic (non-SPMD) mode additionally creates a
/// data-sharing wrapper for top-level parallel regions.
llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Action that marks IsInParallelRegion for the duration of the region's
  // code generation, restoring the previous value on exit (regions may nest).
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);

  // The outlined parallel body is not itself in a TTD (target/teams/
  // distribute) region nor in the target master thread; save and clear the
  // flags while the base class emits the function.
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;

  // In generic mode, a top-level (non-nested) parallel region needs a wrapper
  // that unpacks shared arguments for the worker threads.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}
/// Collect the variables from lastprivate clauses of the distribute directive
/// associated with \p D (a teams directive). If \p D itself is not a
/// distribute directive, look for a single nested distribute directive inside
/// its captured statement; if none is found, collect nothing.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    // Peel off wrapping containers to find the lone nested directive, if any.
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      // Only a distribute directive is of interest here.
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}
/// Collect the private copies of every reduction clause on the given teams
/// directive into \p Vars.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *RC : D.getClausesOfKind<OMPReductionClause>())
    for (const Expr *Ref : RC->privates())
      Vars.push_back(getPrivateItem(Ref));
}
/// Outline a teams region for the GPU. In SPMD mode, distribute-lastprivate
/// variables are globalized via a synthesized record; in generic mode, teams
/// reduction variables are stashed in TeamAndReductions for later handling.
/// A pre/post action wires the generic-mode variable prolog/epilog around the
/// outlined body.
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Generic mode: gather reduction privates for deferred processing.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    // SPMD mode: distribute lastprivates must be globalized; build a record
    // that holds one (warp-sized) slot per variable.
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    // Remember the teams captured decl and its reduction variables; only one
    // pending teams declaration is expected at a time.
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Action that registers the globalized variables with the function's
  // globalization state on entry and emits the matching prolog/epilog.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
/// Emit the prolog that globalizes escaped local variables: each variable in
/// the function's globalization record is moved to runtime-managed shared
/// storage via __kmpc_alloc_shared, escaped parameters are copied into the
/// new storage, and the parameter remapping is applied.
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Load the current value of an escaped parameter before its storage is
    // redirected, so it can be copied into the globalized location below.
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate runtime-shared storage for the variable.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // getNewAlign() reports bits; the Alignment return attribute takes bytes.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the opaque pointer to the variable's type and record both the
    // typed address and the raw allocation (needed for the matching free).
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
    Rec.second.GlobalizedVal = VoidPtr;

    // Store the saved parameter value into its globalized slot and remap the
    // parameter to the new address.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }
  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Compute the memory size of the VLA object padded up to its alignment.
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);

    // Allocate runtime-shared storage for the VLA object.
    // NOTE(review): the padded Size computed above is currently unused; the
    // raw type size is passed instead — confirm whether that is intended.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // Fix: convert getNewAlign() from bits to bytes (/ 8), matching the
    // fixed-size case above — the Alignment attribute is expressed in bytes.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Remember the allocation and its size so the epilog can free it, and
    // remap the declaration to the globalized storage.
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
        std::pair<llvm::Value *, llvm::Value *>(
            {VoidPtr, CGF.getTypeSize(VD->getType())}));
    LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress(CGF));
  }
  I->getSecond().MappedParams->apply(CGF);
}
/// Emit the epilog matching emitGenericVarsProlog: restore remapped
/// parameters and release every __kmpc_alloc_shared allocation, in reverse
/// allocation order.
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Free the storage of each globalized VLA object (reverse of allocation).
    for (auto AddrSizePair :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          {AddrSizePair.first, AddrSizePair.second});
    }
    // Free the storage of each globalized fixed-size variable.
    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
      const auto *VD = cast<VarDecl>(Rec.first);
      I->getSecond().MappedParams->restore(CGF);

      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
                                 CGF.getTypeSize(VD->getType())};
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          FreeArgs);
    }
  }
}
/// Emit the call into the outlined teams function. The outlined function
/// expects (&thread_id, &zero, captured vars...); the second slot is a
/// temporary initialized to zero.
void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, ".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(0), ZeroAddr);

  llvm::SmallVector<llvm::Value *, 16> Args;
  Args.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
  Args.push_back(ZeroAddr.getPointer());
  Args.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args);
}
/// Emit a parallel region on the device by calling __kmpc_parallel_51 with
/// the outlined function (and its data-sharing wrapper, if any), the if/
/// num_threads clause values, and the array of captured-variable addresses.
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    // In generic mode a wrapper function was registered for the outlined
    // function; pass it as the region identifier. Otherwise the ID is null.
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    // Store the addresses of all captured variables into a contiguous array
    // handed to the runtime.
    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    if (!CapturedVars.empty()) {
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        // By-value integer captures are smuggled through the pointer slot.
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    // if(expr) lowers to an i32 flag; absent clause means "true".
    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /*isSigned=*/false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    // num_threads(expr) lowers to an i32; -1 means "no limit specified".
    // Fix: this statement previously ended with a comma operator, gluing the
    // assert below into the else branch so it was skipped when NumThreads was
    // null; terminate with a semicolon so the assert always runs.
    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
/// Emit a CTA-wide synchronization via __kmpc_barrier_simple_spmd, passing a
/// null source-location ident and a zero thread id.
void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  if (!CGF.HaveInsertPoint())
    return;
  llvm::Value *BarrierArgs[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  llvm::FunctionCallee Barrier = OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd);
  CGF.EmitRuntimeCall(Barrier, BarrierArgs);
}
/// Lower an OpenMP barrier to a __kmpc_barrier call, encoding the directive
/// kind in the location flags.
void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  if (!CGF.HaveInsertPoint())
    return;
  llvm::Value *BarrierArgs[] = {
      emitUpdateLocation(CGF, Loc, getDefaultFlagsForBarriers(Kind)),
      getThreadID(CGF, Loc)};
  llvm::FunctionCallee Barrier = OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_barrier);
  CGF.EmitRuntimeCall(Barrier, BarrierArgs);
}
/// Emit a critical region on the GPU. Threads of the team execute the region
/// one at a time: a per-thread counter is stepped from 0 to the team width,
/// and on each iteration only the thread whose id equals the counter enters
/// the body; the rest wait on a warp sync before the counter is advanced.
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Capture the mask of currently-active threads so they can resynchronize
  // after the serialized body.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));

  // This thread's id and the team width bound the serialization loop.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Per-thread loop counter, initialized to zero.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Loop while counter < team width.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Only the thread whose id matches the counter enters the body.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // The body itself is still guarded by the base critical lowering.
  CGF.EmitBlock(BodyBB);
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // Resynchronize the warp with the saved active mask, bump the counter, and
  // iterate.
  CGF.EmitBlock(SyncBB);
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
/// Cast a scalar value from \p ValTy to \p CastTy. Same type: returned as is;
/// same size: bitcast; both integers: integer cast; otherwise the value is
/// spilled to a temporary of the destination type and reloaded.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  // Same size but different types: reinterpret the bits.
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  // Integer-to-integer: extend/truncate following the destination signedness.
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  // General case: store through a pointer reinterpreted to the source type,
  // then reload as the destination type.
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
      Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}
/// Emit a call to the appropriate __kmpc_shuffle_int{32,64} runtime function
/// to move \p Elem between lanes of a warp. The element is first cast to a
/// 32- or 64-bit integer (depending on its size, which must be at most 8
/// bytes), shuffled, and cast back to its original type.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  // Select the 32- or 64-bit shuffle entry point based on the element size.
  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast the element to the matching signed integer type for the call.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
/// Shuffle an element of arbitrary size across warp lanes and store it at
/// DestAddr. The element is decomposed into 8/4/2/1-byte integer chunks; each
/// chunk is shuffled via createRuntimeShuffleFunction and stored. When more
/// than one chunk of a given size is needed, a PHI-based loop walks the
/// chunks until the end of the source region is reached.
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  // One-past-the-end of the source element, used as the loop bound below.
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    // Skip chunk sizes larger than what remains.
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    // Reinterpret source and destination cursors as pointers to the chunk
    // integer type.
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
                                                  IntTy);
    ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtr, IntTy->getPointerTo(), IntTy);
    if (Size.getQuantity() / IntSize > 1) {
      // Multiple chunks of this size: emit a loop that shuffles and stores
      // one chunk per iteration while at least IntSize bytes remain.
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      // Loop-carried source and destination cursors.
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
      ElemPtr =
          Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
      // Remaining bytes = PtrEnd - current source cursor.
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          CGF.Int8Ty, PtrEnd.getPointer(),
          Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
                                                  CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      // Shuffle the current chunk across lanes and store it.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      // Advance both cursors and feed them back into the PHIs.
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      // Exactly one chunk of this size: shuffle, store, advance.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    // Bytes still to be copied with smaller chunk sizes.
    Size = Size % IntSize;
  }
}
namespace {
/// Kinds of copies performed between reduce lists by emitReductionListCopy.
enum CopyAction : unsigned {
  /// Copy from a remote lane's reduce list into a thread-local temporary.
  RemoteLaneToThread,
  /// Plain element-wise copy between two reduce lists of the same thread.
  ThreadCopy,
  /// Copy from a thread's reduce list into a scratchpad in global memory.
  ThreadToScratchpad,
  /// Copy from a global-memory scratchpad into a thread's reduce list.
  ScratchpadToThread,
};
} // anonymous namespace
/// Optional parameters for emitReductionListCopy; only the fields relevant to
/// the selected CopyAction are used.
struct CopyOptionsTy {
  /// Lane offset for warp shuffles (RemoteLaneToThread).
  llvm::Value *RemoteLaneOffset;
  /// Row index into the scratchpad (ThreadToScratchpad/ScratchpadToThread).
  llvm::Value *ScratchpadIndex;
  /// Number of columns in the scratchpad.
  llvm::Value *ScratchpadWidth;
};
/// Emit instructions to copy a Reduce list (an array of pointers to the
/// partial reduction values of each clause) from SrcBase to DestBase,
/// according to the requested CopyAction. Depending on the action, elements
/// may be shuffled in from a remote warp lane, copied between lists, or
/// moved to/from a global-memory scratchpad.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate over the elements of the source and destination Reduce lists.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Does the destination list hold a pointer that must be updated to point
    // at the freshly-created temporary element?
    bool UpdateDestListPtr = false;
    // Scratchpad bases advance row-by-row; track which side needs stepping.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;
    QualType PrivatePtrType = C.getPointerType(Private->getType());
    llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);

    switch (Action) {
    case RemoteLaneToThread: {
      // Source: remote lane's reduce list element (via shuffle below).
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    SrcElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());
      // Destination: a new thread-local temporary; the destination list's
      // pointer slot is updated to reference it.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Source and destination: the corresponding elements of the two lists.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    SrcElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    DestElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Source: the thread's reduce list element.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    SrcElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());
      // Destination: scratchpad base + ScratchpadIndex * element size.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Source: scratchpad base + ScratchpadIndex * element size.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;
      // Destination: a new thread-local temporary; update the list pointer.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of the action, the element addresses are typed as the
    // private variable's memory type before copying.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Now that all active lanes have read the element, either shuffle it in
    // from the remote lane or copy it according to its evaluation kind.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      switch (CGF.getEvaluationKind(Private->getType())) {
      case TEK_Scalar: {
        llvm::Value *Elem = CGF.EmitLoadOfScalar(
            SrcElementAddr, /*Volatile=*/false, Private->getType(),
            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
            TBAAAccessInfo());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(
            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
        break;
      }
      case TEK_Complex: {
        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getExprLoc());
        CGF.EmitStoreOfComplex(
            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            /*isInit=*/false);
        break;
      }
      case TEK_Aggregate:
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
        break;
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Advance the scratchpad base to the next row, rounding the
    // offset up to GlobalMemoryAlignment (unless this is the last element).
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Align the base pointer: ((x - 1) / align + 1) * align.
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase =
            Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase =
            Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
    }

    ++Idx;
  }
}
/// Build a helper function that copies partial reduction values across warps
/// of a team through a shared-memory staging buffer
/// ("__openmp_nvptx_data_transfer_temporary_storage", one 32-bit slot per
/// warp). For each reduce-list element, chunk by chunk (4/2/1 bytes): warp
/// masters write their chunk to the buffer, the team synchronizes, then the
/// first warp's threads read the buffer back into lane 0's reduce list.
///
/// Signature: void(void *reduce_list, i32 num_warps).
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // Arguments: the reduce list and the number of active warps.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /*Signed=*/true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                    llvm::GlobalValue::InternalLinkage,
                                    "_omp_reduction_inter_warp_copy_func", &M);
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Shared-memory staging buffer: one i32 per warp, created on first use and
  // kept alive as a compiler-used global.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
        llvm::UndefValue::get(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  // Thread id in the team, lane id within the warp, warp id in the team.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(
              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
          ElemTy->getPointerTo()),
      ElemTy, CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    // Element size rounded up to its alignment; copied in ever-smaller
    // integer chunks (4, 2, 1 bytes) until nothing remains.
    unsigned RealTySize =
        C.getTypeSizeInChars(Private->getType())
            .alignTo(C.getTypeAlignInChars(Private->getType()))
            .getQuantity();
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
      unsigned NumIters = RealTySize / TySize;
      if (NumIters == 0)
        continue;
      QualType CType = C.getIntTypeForBitwidth(
          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
      CharUnits Align = CharUnits::fromQuantity(TySize);
      llvm::Value *Cnt = nullptr;
      Address CntAddr = Address::invalid();
      llvm::BasicBlock *PrecondBB = nullptr;
      llvm::BasicBlock *ExitBB = nullptr;
      // More than one chunk of this size: wrap the copy in a counted loop.
      if (NumIters > 1) {
        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
                              /*Volatile=*/false, C.IntTy);
        PrecondBB = CGF.createBasicBlock("precond");
        ExitBB = CGF.createBasicBlock("exit");
        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
        // Drop the debug location for the loop plumbing.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(PrecondBB);
        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
        llvm::Value *Cmp =
            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
        CGF.EmitBlock(BodyBB);
      }
      // Make sure the staging buffer is free before warp masters write.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);

      // Phase 1: each warp's lane 0 writes its chunk to the warp's slot in
      // the staging buffer.
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
      CGF.EmitBlock(ThenBB);

      // Address of this thread's reduce-list element (offset by Cnt when
      // looping over chunks).
      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
      if (NumIters > 1)
        ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);

      // This warp's slot in the staging buffer.
      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
      Address MediumPtr(
          Bld.CreateBitCast(
              MediumPtrVal,
              CopyType->getPointerTo(
                  MediumPtrVal->getType()->getPointerAddressSpace())),
          CopyType, Align);

      // Copy the chunk; the store is volatile so it is not optimized across
      // the barriers.
      llvm::Value *Elem = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, CType, Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(ElseBB);
      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(MergeBB);

      // Wait until all warp masters have written their chunk.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);

      // Phase 2: the first NumWarps threads read the staged chunks back into
      // the reduce list.
      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);

      // Only threads with id < number of warps participate.
      llvm::Value *IsActiveThread =
          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

      CGF.EmitBlock(W0ThenBB);

      // Slot written by warp <ThreadID> in phase 1.
      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
      Address SrcMediumPtr(
          Bld.CreateBitCast(
              SrcMediumPtrVal,
              CopyType->getPointerTo(
                  SrcMediumPtrVal->getType()->getPointerAddressSpace())),
          CopyType, Align);

      // Destination chunk in the reduce-list element.
      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
      Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
      if (NumIters > 1)
        TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);

      // Volatile load paired with the volatile store above.
      llvm::Value *SrcMediumValue =
          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            CType);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0ElseBB);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0MergeBB);

      // Close the chunk loop: increment the counter and branch back.
      if (NumIters > 1) {
        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
        CGF.EmitBranch(PrecondBB);
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(ExitBB);
      }
      RealTySize %= TySize;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}
/// Emits "_omp_reduction_shuffle_and_reduce_func", a device helper with the
/// signature:
///   void (void *reduce_list, short lane_id, short remote_lane_offset,
///         short algo_version)
/// It materializes a local copy of a remote lane's reduce list (via
/// emitReductionListCopy in RemoteLaneToThread mode), conditionally combines
/// it into the caller's list with ReduceFn, and, for algorithm version 1,
/// conditionally replaces the local list with the remote one.
static llvm::Function *emitShuffleAndReduceFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
ASTContext &C = CGM.getContext();
// Parameter 0: the opaque (void *) reduce list.
ImplicitParamDecl ReduceListArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
// Parameter 1: this lane's id.
ImplicitParamDecl LaneIDArg(C, nullptr, Loc, nullptr, C.ShortTy,
ImplicitParamDecl::Other);
// Parameter 2: offset to the remote lane whose data is combined in.
ImplicitParamDecl RemoteLaneOffsetArg(C, nullptr, Loc, nullptr,
C.ShortTy, ImplicitParamDecl::Other);
// Parameter 3: which algorithm variant to apply (0, 1 or 2 — see below).
ImplicitParamDecl AlgoVerArg(C, nullptr, Loc, nullptr,
C.ShortTy, ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&ReduceListArg);
Args.push_back(&LaneIDArg);
Args.push_back(&RemoteLaneOffsetArg);
Args.push_back(&AlgoVerArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
// Internal-linkage helper passed to the runtime's reduce entry points.
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
"_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
// Reinterpret the opaque reduce-list argument as a pointer to the
// reduction array type.
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrReduceListArg, false,
C.VoidPtrTy, SourceLocation()),
ElemTy->getPointerTo()),
ElemTy, CGF.getPointerAlign());
// Load the three short arguments.
Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
AddrLaneIDArg, false, C.ShortTy, SourceLocation());
Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
AddrRemoteLaneOffsetArg, false, C.ShortTy, SourceLocation());
Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
AddrAlgoVerArg, false, C.ShortTy, SourceLocation());
// Copy the remote lane's reduce list into a local temporary; the element
// data is transferred across lanes by the RemoteLaneToThread copy mode
// using the given remote-lane offset.
Address RemoteReduceList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
LocalReduceList, RemoteReduceList,
{RemoteLaneOffsetArgVal,
nullptr,
nullptr});
// A lane reduces the remote data into its own list when:
//   AlgoVer == 0                                  (every lane reduces), or
//   AlgoVer == 1 && lane_id < remote_lane_offset, or
//   AlgoVer == 2 && lane_id is even && remote_lane_offset > 0.
llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
llvm::Value *CondAlgo1 = Bld.CreateAnd(
Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
llvm::Value *CondAlgo2 = Bld.CreateAnd(
Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
CondAlgo2 = Bld.CreateAnd(
CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
CGF.EmitBlock(ThenBB);
// then: ReduceFn(local_reduce_list, remote_reduce_list).
llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
LocalReduceList.getPointer(), CGF.VoidPtrTy);
llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
RemoteReduceList.getPointer(), CGF.VoidPtrTy);
CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
Bld.CreateBr(MergeBB);
CGF.EmitBlock(ElseBB);
Bld.CreateBr(MergeBB);
CGF.EmitBlock(MergeBB);
// For AlgoVer == 1, lanes with lane_id >= remote_lane_offset adopt the
// remote list wholesale instead (ThreadCopy, remote -> local).
Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
llvm::Value *CondCopy = Bld.CreateAnd(
Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
CGF.EmitBlock(CpyThenBB);
emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
RemoteReduceList, LocalReduceList);
Bld.CreateBr(CpyMergeBB);
CGF.EmitBlock(CpyElseBB);
Bld.CreateBr(CpyMergeBB);
CGF.EmitBlock(CpyMergeBB);
CGF.FinishFunction();
return Fn;
}
/// Emits "_omp_reduction_list_to_global_copy_func" with the signature:
///   void (void *global_buffer, int idx, void *reduce_list)
/// For each private in the reduce list, copies its value into the matching
/// field of the team-reduction buffer record (TeamReductionRec), at element
/// `idx` of that field. VarFieldMap maps each private's decl to its field.
static llvm::Value *emitListToGlobalCopyFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, SourceLocation Loc,
const RecordDecl *TeamReductionRec,
const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&VarFieldMap) {
ASTContext &C = CGM.getContext();
// Parameters: buffer pointer, row index, and the source reduce list.
ImplicitParamDecl BufferArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
ImplicitParamDecl IdxArg(C, nullptr, Loc, nullptr, C.IntTy,
ImplicitParamDecl::Other);
ImplicitParamDecl ReduceListArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&BufferArg);
Args.push_back(&IdxArg);
Args.push_back(&ReduceListArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
"_omp_reduction_list_to_global_copy_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
// View the reduce-list argument as a pointer to the reduction array type.
llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrReduceListArg, false,
C.VoidPtrTy, Loc),
ElemTy->getPointerTo()),
ElemTy, CGF.getPointerAlign());
// View the buffer argument as a pointer to the team-reduction record.
QualType StaticTy = C.getRecordType(TeamReductionRec);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrBufferArg, false, C.VoidPtrTy, Loc),
LLVMReductionsBufferTy->getPointerTo());
// GEP indices {0, idx} select element `idx` within each buffer field.
llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
false, C.IntTy,
Loc)};
unsigned Idx = 0;
for (const Expr *Private : Privates) {
// Load the element pointer stored in slot Idx of the reduce list and
// retype it as a pointer to the private's memory type.
Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
ElemPtrPtrAddr, false, C.VoidPtrTy, SourceLocation());
ElemTy = CGF.ConvertTypeForMem(Private->getType());
ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
ElemPtrPtr, ElemTy->getPointerTo());
Address ElemPtr =
Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
// Locate the field in the buffer record for this private, then address
// element `idx` of that field.
const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
const FieldDecl *FD = VarFieldMap.lookup(VD);
LValue GlobLVal = CGF.EmitLValueForField(
CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
Address GlobAddr = GlobLVal.getAddress(CGF);
llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
GlobAddr.getPointer(), Idxs);
GlobLVal.setAddress(Address(BufferPtr,
CGF.ConvertTypeForMem(Private->getType()),
GlobAddr.getAlignment()));
// Copy list element -> buffer slot according to the evaluation kind.
switch (CGF.getEvaluationKind(Private->getType())) {
case TEK_Scalar: {
llvm::Value *V = CGF.EmitLoadOfScalar(
ElemPtr, false, Private->getType(), Loc,
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
CGF.EmitStoreOfScalar(V, GlobLVal);
break;
}
case TEK_Complex: {
CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
CGF.EmitStoreOfComplex(V, GlobLVal, false);
break;
}
case TEK_Aggregate:
CGF.EmitAggregateCopy(GlobLVal,
CGF.MakeAddrLValue(ElemPtr, Private->getType()),
Private->getType(), AggValueSlot::DoesNotOverlap);
break;
}
++Idx;
}
CGF.FinishFunction();
return Fn;
}
/// Emits "_omp_reduction_list_to_global_reduce_func" with the signature:
///   void (void *global_buffer, int idx, void *reduce_list)
/// Builds a fresh reduce list whose entries point at element `idx` of each
/// field of the global team-reduction buffer, then calls
/// ReduceFn(global_list, incoming_list) to fold the incoming reduce list
/// into the buffer.
static llvm::Value *emitListToGlobalReduceFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, SourceLocation Loc,
const RecordDecl *TeamReductionRec,
const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&VarFieldMap,
llvm::Function *ReduceFn) {
ASTContext &C = CGM.getContext();
// Parameters: buffer pointer, row index, and the incoming reduce list.
ImplicitParamDecl BufferArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
ImplicitParamDecl IdxArg(C, nullptr, Loc, nullptr, C.IntTy,
ImplicitParamDecl::Other);
ImplicitParamDecl ReduceListArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&BufferArg);
Args.push_back(&IdxArg);
Args.push_back(&ReduceListArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
"_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
// View the buffer argument as a pointer to the team-reduction record.
Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
QualType StaticTy = C.getRecordType(TeamReductionRec);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrBufferArg, false, C.VoidPtrTy, Loc),
LLVMReductionsBufferTy->getPointerTo());
// Local reduce list that will point into the global buffer.
Address ReductionList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
auto IPriv = Privates.begin();
// GEP indices {0, idx} select element `idx` within each buffer field.
llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
false, C.IntTy,
Loc)};
unsigned Idx = 0;
for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
// Store a pointer to buffer-field[idx] into slot Idx of the list.
Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
const FieldDecl *FD = VarFieldMap.lookup(VD);
LValue GlobLVal = CGF.EmitLValueForField(
CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
Address GlobAddr = GlobLVal.getAddress(CGF);
llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
CGF.EmitStoreOfScalar(Ptr, Elem, false, C.VoidPtrTy);
if ((*IPriv)->getType()->isVariablyModifiedType()) {
// VLA privates use the next list slot to carry the element count,
// smuggled through the void* slot via inttoptr.
++Idx;
Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
llvm::Value *Size = CGF.Builder.CreateIntCast(
CGF.getVLASize(
CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
.NumElts,
CGF.SizeTy, false);
CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
Elem);
}
}
// Fold the incoming list into the buffer: ReduceFn(global, incoming).
llvm::Value *GlobalReduceList =
CGF.EmitCastToVoidPtr(ReductionList.getPointer());
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
AddrReduceListArg, false, C.VoidPtrTy, Loc);
CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
CGF.FinishFunction();
return Fn;
}
/// Emits "_omp_reduction_global_to_list_copy_func" with the signature:
///   void (void *global_buffer, int idx, void *reduce_list)
/// Inverse of the list-to-global copy: for each private it copies element
/// `idx` of the matching team-reduction buffer field into the corresponding
/// reduce-list element. VarFieldMap maps each private's decl to its field.
static llvm::Value *emitGlobalToListCopyFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, SourceLocation Loc,
const RecordDecl *TeamReductionRec,
const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&VarFieldMap) {
ASTContext &C = CGM.getContext();
// Parameters: buffer pointer, row index, and the destination reduce list.
ImplicitParamDecl BufferArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
ImplicitParamDecl IdxArg(C, nullptr, Loc, nullptr, C.IntTy,
ImplicitParamDecl::Other);
ImplicitParamDecl ReduceListArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&BufferArg);
Args.push_back(&IdxArg);
Args.push_back(&ReduceListArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
"_omp_reduction_global_to_list_copy_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
// View the reduce-list argument as a pointer to the reduction array type.
llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrReduceListArg, false,
C.VoidPtrTy, Loc),
ElemTy->getPointerTo()),
ElemTy, CGF.getPointerAlign());
// View the buffer argument as a pointer to the team-reduction record.
QualType StaticTy = C.getRecordType(TeamReductionRec);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrBufferArg, false, C.VoidPtrTy, Loc),
LLVMReductionsBufferTy->getPointerTo());
// GEP indices {0, idx} select element `idx` within each buffer field.
llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
false, C.IntTy,
Loc)};
unsigned Idx = 0;
for (const Expr *Private : Privates) {
// Load the element pointer stored in slot Idx of the reduce list and
// retype it as a pointer to the private's memory type.
Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
ElemPtrPtrAddr, false, C.VoidPtrTy, SourceLocation());
ElemTy = CGF.ConvertTypeForMem(Private->getType());
ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
ElemPtrPtr, ElemTy->getPointerTo());
Address ElemPtr =
Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
// Locate the buffer field for this private and address element `idx`.
const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
const FieldDecl *FD = VarFieldMap.lookup(VD);
LValue GlobLVal = CGF.EmitLValueForField(
CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
Address GlobAddr = GlobLVal.getAddress(CGF);
llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
GlobAddr.getPointer(), Idxs);
GlobLVal.setAddress(Address(BufferPtr,
CGF.ConvertTypeForMem(Private->getType()),
GlobAddr.getAlignment()));
// Copy buffer slot -> list element according to the evaluation kind.
switch (CGF.getEvaluationKind(Private->getType())) {
case TEK_Scalar: {
llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
CGF.EmitStoreOfScalar(V, ElemPtr, false, Private->getType(),
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo());
break;
}
case TEK_Complex: {
CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
false);
break;
}
case TEK_Aggregate:
CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
GlobLVal, Private->getType(),
AggValueSlot::DoesNotOverlap);
break;
}
++Idx;
}
CGF.FinishFunction();
return Fn;
}
/// Emits "_omp_reduction_global_to_list_reduce_func" with the signature:
///   void (void *global_buffer, int idx, void *reduce_list)
/// Builds a reduce list whose entries point at element `idx` of each field
/// of the global team-reduction buffer, then calls
/// ReduceFn(incoming_list, global_list) — note the argument order is the
/// reverse of the list-to-global variant, folding the buffer contents into
/// the incoming reduce list.
static llvm::Value *emitGlobalToListReduceFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, SourceLocation Loc,
const RecordDecl *TeamReductionRec,
const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&VarFieldMap,
llvm::Function *ReduceFn) {
ASTContext &C = CGM.getContext();
// Parameters: buffer pointer, row index, and the incoming reduce list.
ImplicitParamDecl BufferArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
ImplicitParamDecl IdxArg(C, nullptr, Loc, nullptr, C.IntTy,
ImplicitParamDecl::Other);
ImplicitParamDecl ReduceListArg(C, nullptr, Loc, nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&BufferArg);
Args.push_back(&IdxArg);
Args.push_back(&ReduceListArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
"_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
// View the buffer argument as a pointer to the team-reduction record.
Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
QualType StaticTy = C.getRecordType(TeamReductionRec);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrBufferArg, false, C.VoidPtrTy, Loc),
LLVMReductionsBufferTy->getPointerTo());
// Local reduce list that will point into the global buffer.
Address ReductionList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
auto IPriv = Privates.begin();
// GEP indices {0, idx} select element `idx` within each buffer field.
llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
false, C.IntTy,
Loc)};
unsigned Idx = 0;
for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
// Store a pointer to buffer-field[idx] into slot Idx of the list.
Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
const FieldDecl *FD = VarFieldMap.lookup(VD);
LValue GlobLVal = CGF.EmitLValueForField(
CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
Address GlobAddr = GlobLVal.getAddress(CGF);
llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
CGF.EmitStoreOfScalar(Ptr, Elem, false, C.VoidPtrTy);
if ((*IPriv)->getType()->isVariablyModifiedType()) {
// VLA privates use the next list slot to carry the element count,
// smuggled through the void* slot via inttoptr.
++Idx;
Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
llvm::Value *Size = CGF.Builder.CreateIntCast(
CGF.getVLASize(
CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
.NumElts,
CGF.SizeTy, false);
CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
Elem);
}
}
// Fold the buffer into the incoming list: ReduceFn(incoming, global).
llvm::Value *GlobalReduceList =
CGF.EmitCastToVoidPtr(ReductionList.getPointer());
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
AddrReduceListArg, false, C.VoidPtrTy, Loc);
CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
CGF.FinishFunction();
return Fn;
}
/// Emits the GPU lowering of an OpenMP reduction clause: packs pointers to
/// the thread-private copies (plus VLA sizes) into a "reduce list" array,
/// emits the helper functions the device runtime requires, and invokes the
/// parallel or teams nowait-reduction runtime entry point. Threads for which
/// the runtime returns 1 combine the partial results via the reduction ops
/// and then call __kmpc_nvptx_end_reduce_nowait.
void CGOpenMPRuntimeGPU::emitReduction(
CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
if (!CGF.HaveInsertPoint())
return;
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
// TeamsReduction is only consulted in assertions below.
#ifndef NDEBUG
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
#endif
// Simple (serialized) reductions are handled by the host-generic path.
if (Options.SimpleReduction) {
assert(!TeamsReduction && !ParallelReduction &&
"Invalid reduction selection in emitReduction.");
CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
ReductionOps, Options);
return;
}
assert((TeamsReduction || ParallelReduction) &&
"Invalid reduction selection in emitReduction.");
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *ThreadId = getThreadID(CGF, Loc);
llvm::Value *Res;
ASTContext &C = CGM.getContext();
// Each VLA private consumes an extra list slot for its element count.
auto Size = RHSExprs.size();
for (const Expr *E : Privates) {
if (E->getType()->isVariablyModifiedType())
++Size;
}
// The reduce list is an array of void* of the computed size.
llvm::APInt ArraySize(32, Size);
QualType ReductionArrayTy =
C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
0);
Address ReductionList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
auto IPriv = Privates.begin();
unsigned Idx = 0;
// Populate the list with pointers to the RHS (private) copies, plus VLA
// element counts smuggled through void* slots via inttoptr.
for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
CGF.Builder.CreateStore(
CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
Elem);
if ((*IPriv)->getType()->isVariablyModifiedType()) {
++Idx;
Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
llvm::Value *Size = CGF.Builder.CreateIntCast(
CGF.getVLASize(
CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
.NumElts,
CGF.SizeTy, false);
CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
Elem);
}
}
llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
ReductionList.getPointer(), CGF.VoidPtrTy);
// Helpers handed to the runtime entry points.
llvm::Function *ReductionFn =
emitReductionFunction(Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
Privates, LHSExprs, RHSExprs, ReductionOps);
llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
llvm::Value *InterWarpCopyFn =
emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
if (ParallelReduction) {
llvm::Value *Args[] = {RTLoc,
ThreadId,
CGF.Builder.getInt32(RHSExprs.size()),
ReductionArrayTySize,
RL,
ShuffleAndReduceFn,
InterWarpCopyFn};
Res = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
Args);
} else {
assert(TeamsReduction && "expected teams reduction.");
// Teams reductions additionally go through a global buffer: build the
// record describing one buffer row and the four copy/reduce helpers.
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
int Cnt = 0;
for (const Expr *DRE : Privates) {
PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
++Cnt;
}
const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
C.getLangOpts().OpenMPCUDAReductionBufNum);
TeamsReductions.push_back(TeamReductionRec);
// Lazily create the module-level pointer the runtime fills with the
// address of the teams-reduction buffer.
if (!KernelTeamsReductionPtr) {
KernelTeamsReductionPtr = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, true,
llvm::GlobalValue::InternalLinkage, nullptr,
"_openmp_teams_reductions_buffer_$_$ptr");
}
llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
false, C.getPointerType(C.VoidPtrTy), Loc);
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
ReductionFn);
llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
ReductionFn);
llvm::Value *Args[] = {
RTLoc,
ThreadId,
GlobalBufferPtr,
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
RL,
ShuffleAndReduceFn,
InterWarpCopyFn,
GlobalToBufferCpyFn,
GlobalToBufferRedFn,
BufferToGlobalCpyFn,
BufferToGlobalRedFn};
Res = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
Args);
}
// Only threads for which the runtime call returned 1 perform the final
// per-variable combination and signal the end of the reduction.
llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
Res, llvm::ConstantInt::get(CGM.Int32Ty, 1));
CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
CGF.EmitBlock(ThenBB);
// Combine each LHS/RHS pair with its reduction operation.
auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
this](CodeGenFunction &CGF, PrePostActionTy &Action) {
auto IPriv = Privates.begin();
auto ILHS = LHSExprs.begin();
auto IRHS = RHSExprs.begin();
for (const Expr *E : ReductionOps) {
emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
cast<DeclRefExpr>(*IRHS));
++IPriv;
++ILHS;
++IRHS;
}
};
llvm::Value *EndArgs[] = {ThreadId};
RegionCodeGenTy RCG(CodeGen);
// Run the combiner, then __kmpc_nvptx_end_reduce_nowait on exit.
NVPTXActionTy Action(
nullptr, llvm::None,
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
EndArgs);
RCG.setAction(Action);
RCG(CGF);
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(ExitBB, true);
}
/// Translates a reference-typed native parameter into the pointer-typed
/// parameter used by the device outlined function.
///
/// Non-reference parameters pass through untouched. For references, the
/// pointee is extracted, mapped captures are retargeted to global memory,
/// and the resulting pointer is qualified restrict + the NVPTX local
/// address space before a fresh parameter declaration is created.
const VarDecl *
CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                       const VarDecl *NativeParam) const {
  // Only reference parameters need a translated declaration.
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;

  ASTContext &Ctx = CGM.getContext();
  QualType ParamTy = NativeParam->getType();
  QualifierCollector Quals;
  const Type *UnqualTy = Quals.strip(ParamTy);
  QualType PointeeTy = cast<ReferenceType>(UnqualTy)->getPointeeType();
  // Captures coming from a map clause refer to global memory.
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>())
    if (Attr->getCaptureKind() == OMPC_map)
      PointeeTy = Ctx.getAddrSpaceQualType(PointeeTy, LangAS::opencl_global);

  // Reference becomes a restrict-qualified pointer in the NVPTX "local"
  // address space (target AS 5).
  ParamTy = Ctx.getPointerType(PointeeTy);
  Quals.addRestrict();
  enum { NVPTX_local_addr = 5 };
  Quals.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ParamTy = Quals.apply(Ctx, ParamTy);

  // Mirror the kind of the original declaration.
  if (isa<ImplicitParamDecl>(NativeParam))
    return ImplicitParamDecl::Create(Ctx, /*DC=*/nullptr,
                                     NativeParam->getLocation(),
                                     NativeParam->getIdentifier(), ParamTy,
                                     ImplicitParamDecl::Other);
  return ParmVarDecl::Create(
      Ctx, const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getBeginLoc(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ParamTy,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}
/// Produces the address the body should use for a reference-typed native
/// parameter: loads the translated pointer argument, address-space-casts it
/// (via the generic address space) to the pointee's native address space,
/// and stores it into a fresh temporary whose address is returned.
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  // Load the translated (pointer-typed) argument value.
  Address TargetLocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  llvm::Value *TargetPtr = CGF.EmitLoadOfScalar(
      TargetLocalAddr, /*Volatile=*/false, TargetParam->getType(),
      SourceLocation());
  // Determine the address space the native pointee lives in.
  QualType NativeTy = NativeParam->getType();
  QualifierCollector Quals;
  const Type *UnqualTy = Quals.strip(NativeTy);
  QualType NativePointeeTy = cast<ReferenceType>(UnqualTy)->getPointeeType();
  unsigned NativeAS = CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  // First hop into the generic address space, then into the native one.
  TargetPtr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetPtr, llvm::PointerType::getWithSamePointeeType(
                     cast<llvm::PointerType>(TargetPtr->getType()),
                     /*AddrSpace=*/0));
  TargetPtr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetPtr, llvm::PointerType::getWithSamePointeeType(
                     cast<llvm::PointerType>(TargetPtr->getType()), NativeAS));
  // Spill the adjusted pointer to a temporary and hand back its address.
  Address NativeAddr = CGF.CreateMemTemp(NativeTy);
  CGF.EmitStoreOfScalar(TargetPtr, NativeAddr, /*Volatile=*/false, NativeTy);
  return NativeAddr;
}
/// Forwards a call to an outlined function, rewriting each pointer argument
/// so its address space matches the callee's declared parameter type before
/// delegating to the base implementation.
void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> AdjustedArgs;
  AdjustedArgs.reserve(Args.size());
  llvm::FunctionType *FnTy = OutlinedFn.getFunctionType();
  for (unsigned Idx = 0, NumArgs = Args.size(); Idx < NumArgs; ++Idx) {
    // Arguments beyond the declared parameters of a vararg callee are
    // forwarded untouched.
    if (FnTy->isVarArg() && FnTy->getNumParams() <= Idx) {
      AdjustedArgs.append(std::next(Args.begin(), Idx), Args.end());
      break;
    }
    llvm::Type *ParamTy = FnTy->getParamType(Idx);
    llvm::Value *Arg = Args[Idx];
    // Non-pointer arguments never need an address-space adjustment.
    if (!ParamTy->isPointerTy()) {
      AdjustedArgs.emplace_back(Arg);
      continue;
    }
    // Hop through the generic address space (AS 0) before casting to the
    // callee's expected pointer type.
    llvm::Value *GenericArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        Arg, llvm::PointerType::getWithSamePointeeType(
                 cast<llvm::PointerType>(Arg->getType()), /*AddrSpace=*/0));
    AdjustedArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(GenericArg, ParamTy));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn,
                                            AdjustedArgs);
}
/// Creates a wrapper "<outlined>_wrapper(uint16 parallel_level, uint32 tid)"
/// around an outlined parallel function. The wrapper fetches the shared
/// variable array from the runtime (__kmpc_get_shared_variables), unpacks
/// loop bounds (for loop-bound-sharing directives) and the captured
/// variables, and forwards everything to the outlined function.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
ASTContext &Ctx = CGM.getContext();
const auto &CS = *D.getCapturedStmt(OMPD_parallel);
// Wrapper signature: (unsigned short parallel_level, unsigned int tid).
FunctionArgList WrapperArgs;
QualType Int16QTy =
Ctx.getIntTypeForBitwidth(16, false);
QualType Int32QTy =
Ctx.getIntTypeForBitwidth(32, false);
ImplicitParamDecl ParallelLevelArg(Ctx, nullptr, D.getBeginLoc(),
nullptr, Int16QTy,
ImplicitParamDecl::Other);
ImplicitParamDecl WrapperArg(Ctx, nullptr, D.getBeginLoc(),
nullptr, Int32QTy,
ImplicitParamDecl::Other);
WrapperArgs.emplace_back(&ParallelLevelArg);
WrapperArgs.emplace_back(&WrapperArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
// Keep the wrapper out-of-line so the runtime can call it by address.
Fn->addFnAttr(llvm::Attribute::NoInline);
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM, true);
CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
D.getBeginLoc(), D.getBeginLoc());
const auto *RD = CS.getCapturedRecordDecl();
auto CurField = RD->field_begin();
// First two outlined-function arguments: &tid and a zeroed bound-tid slot.
Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
".zero.addr");
CGF.Builder.CreateStore(CGF.Builder.getInt32( 0), ZeroAddr);
SmallVector<llvm::Value *, 8> Args;
Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
Args.emplace_back(ZeroAddr.getPointer());
CGBuilderTy &Bld = CGF.Builder;
auto CI = CS.capture_begin();
// Ask the runtime for the array of shared-variable pointers.
Address GlobalArgs =
CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
DataSharingArgs);
// Only dereference the shared-args list when something was shared.
Address SharedArgListAddress = Address::invalid();
if (CS.capture_size() > 0 ||
isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
SharedArgListAddress = CGF.EmitLoadOfPointer(
GlobalArgs, CGF.getContext()
.getPointerType(CGF.getContext().VoidPtrTy)
.castAs<PointerType>());
}
unsigned Idx = 0;
// Loop-bound-sharing directives pass the lower and upper bounds in the
// first two shared slots.
if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
llvm::Value *LB = CGF.EmitLoadOfScalar(
TypedAddress,
false,
CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
Args.emplace_back(LB);
++Idx;
Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
llvm::Value *UB = CGF.EmitLoadOfScalar(
TypedAddress,
false,
CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
Args.emplace_back(UB);
++Idx;
}
// Remaining shared slots hold the captured variables, in capture order.
if (CS.capture_size() > 0) {
ASTContext &CGFContext = CGF.getContext();
for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
QualType ElemTy = CurField->getType();
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
CGF.ConvertTypeForMem(ElemTy));
llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
false,
CGFContext.getPointerType(ElemTy),
CI->getLocation());
// Non-pointer by-copy captures are widened to uintptr for the call.
if (CI->capturesVariableByCopy() &&
!CI->getCapturedVar()->getType()->isAnyPointerType()) {
Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
CI->getLocation());
}
Args.emplace_back(Arg);
}
}
emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
CGF.FinishFunction();
return Fn;
}
/// Function prolog for the Generic data-sharing mode: analyzes the body of
/// D for variables that escape their declaration context and must therefore
/// be globalized, records them in FunctionGlobalizedDecls, and (unless
/// globalization is delayed) emits the globalization prolog and pushes a
/// cleanup that emits the matching epilog.
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
const Decl *D) {
// Nothing to do outside the Generic data-sharing mode.
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
return;
assert(D && "Expected function or captured|block decl.");
assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
"Function is registered already.");
assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
"Team is set but not processed.");
// Find the body to scan; captured decls may delay globalization to the
// region codegen, and SPMD-mode captured regions skip it here entirely.
const Stmt *Body = nullptr;
bool NeedToDelayGlobalization = false;
if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
Body = FD->getBody();
} else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
Body = BD->getBody();
} else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
Body = CD->getBody();
NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
if (NeedToDelayGlobalization &&
getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
return;
}
if (!Body)
return;
// Scan the body for escaping variables, seeding with any pending team
// reduction variables; consume those pending values afterwards.
CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
VarChecker.Visit(Body);
const RecordDecl *GlobalizedVarsRecord =
VarChecker.getGlobalizedRecord(IsInTTDRegion);
TeamAndReductions.first = nullptr;
TeamAndReductions.second.clear();
ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
VarChecker.getEscapedVariableLengthDecls();
if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
return;
// Record the globalization data for this function.
auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().MappedParams =
std::make_unique<CodeGenFunction::OMPMapVars>();
I->getSecond().EscapedParameters.insert(
VarChecker.getEscapedParameters().begin(),
VarChecker.getEscapedParameters().end());
I->getSecond().EscapedVariableLengthDecls.append(
EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
assert(VD->isCanonicalDecl() && "Expected canonical declaration");
Data.insert(std::make_pair(VD, MappedVarData()));
}
// Outside target/teams regions (and outside parallel, without delayed
// globalization) do a second scan without the team-reduction seed to
// build the secondary mapping.
if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
VarChecker.Visit(Body);
I->getSecond().SecondaryLocalVarData.emplace();
DeclToAddrMapTy &Data = *I->getSecond().SecondaryLocalVarData;
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
assert(VD->isCanonicalDecl() && "Expected canonical declaration");
Data.insert(std::make_pair(VD, MappedVarData()));
}
}
// Emit the prolog now and schedule the matching epilog as a cleanup so it
// also runs on exceptional exits.
if (!NeedToDelayGlobalization) {
emitGenericVarsProlog(CGF, D->getBeginLoc(), true);
struct GlobalizationScope final : EHScopeStack::Cleanup {
GlobalizationScope() = default;
void Emit(CodeGenFunction &CGF, Flags flags) override {
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsEpilog(CGF, true);
}
};
CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
}
}
// Return the address to use for local variable \p VD, or Address::invalid()
// to fall back to the default local emission.
//
// Two cases are handled: (1) variables with an OpenMP 'allocate' attribute
// naming a const/shared GPU allocator get an internal module-level global in
// the matching address space; (2) in Generic data-sharing mode, variables
// recorded as escaped for the current function resolve to their globalized
// private address.
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
    // These allocators get no special treatment here; use default emission.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    // Emitted as a global in the default address space (AS left untouched).
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    // Create a zero-initialized internal global in the selected address
    // space and hand back a pointer cast into the variable's own space.
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, false,
        llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
        VD->getName(),
        nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        VarTy, Align);
  }
  // Globalized-variable lookup only applies in Generic data-sharing mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return Address::invalid();
  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  // The variable itself is not globalized; it may still be an alias for a
  // globalized one via an OMPReferencedVarAttr.
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }
  return Address::invalid();
}
// Drop the globalization record collected for the just-finished function and
// let the base runtime perform its own per-function cleanup.
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
// Choose the default dist_schedule for a distribute loop on the GPU.  In
// SPMD mode we use 'static' with a chunk equal to the hardware thread count
// of the block; otherwise the generic (host-style) default applies.
void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
    CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind,
                                                    Chunk);
    return;
  }
  ScheduleKind = OMPC_DIST_SCHEDULE_static;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
  // The runtime reports a 32-bit count; widen/convert it to the type of the
  // loop's iteration variable.
  Chunk = CGF.EmitScalarConversion(
      NumThreads, CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      S.getIterationVariable()->getType(), S.getBeginLoc());
}
// Default 'schedule' for worksharing loops on the GPU: static with a chunk
// size of one, materialized as a synthesized 32-bit integer literal.
void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  ASTContext &Ctx = CGF.getContext();
  llvm::APInt One(/*numBits=*/32, /*val=*/1);
  ChunkExpr = IntegerLiteral::Create(
      Ctx, One, Ctx.getIntTypeForBitwidth(32, /*Signed=*/0), SourceLocation());
}
// Fix up by-reference lambda captures inside a target region.  For every
// lambda-typed variable captured by the target directive, store the device
// addresses of the captured entities ('this' and by-ref variables) into the
// corresponding fields of the device-side lambda object so its call operator
// dereferences device memory.
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    // Only lambda-typed captures need adjusting.
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    // Form an lvalue for the lambda object itself, loading through the
    // reference if the variable was captured by reference.
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    // Rewire the lambda's 'this' capture to the device-side CXXThis.
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    // Rewire each by-reference capture that the target region also captures
    // (note: inner VD/VDAddr intentionally shadow the lambda variable's).
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress(CGF);
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}
// Determine whether global variable \p VD carries an OpenMP 'allocate'
// attribute with a predefined allocator, and if so report the language
// address space the variable should live in through \p AS.  Returns false
// (leaving \p AS untouched) when no such attribute is present.
bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *AllocAttr = VD->getAttr<OMPAllocateDeclAttr>();
  switch (AllocAttr->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
    // No dedicated GPU memory space for these allocators.
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    // OMPConstMemAlloc maps to CUDA constant memory.
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    // OMPPTeamMemAlloc maps to CUDA shared memory.
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for the variables with the "
                     "static storage.");
  }
  // Not reachable for a valid allocator kind; silences missing-return
  // diagnostics.
  return false;
}
// Derive the CUDA architecture being compiled for from the target's feature
// map.  Returns CudaArch::UNKNOWN for non-PTX targets or when no feature
// name parses as an architecture.
static CudaArch getCudaArch(CodeGenModule &CGM) {
  if (!CGM.getTarget().hasFeature("ptx"))
    return CudaArch::UNKNOWN;
  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
    // Skip disabled features.
    if (!Feature.getValue())
      continue;
    // The first enabled feature whose name is a known arch wins.
    CudaArch Arch = StringToCudaArch(Feature.getKey());
    if (Arch != CudaArch::UNKNOWN)
      return Arch;
  }
  return CudaArch::UNKNOWN;
}
// Process '#pragma omp requires' for GPU targets.  For the
// 'unified_shared_memory' clause, emit an error on CUDA architectures that
// do not support unified addressing (sm_53 and older); all newer NVPTX archs
// and the listed AMDGCN archs are accepted.  Finally delegate to the base
// implementation.
void CGOpenMPRuntimeGPU::processRequiresDirective(
    const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      CudaArch Arch = getCudaArch(CGM);
      switch (Arch) {
      // Pre-sm_60 NVPTX architectures: unified addressing unsupported.
      case CudaArch::SM_20:
      case CudaArch::SM_21:
      case CudaArch::SM_30:
      case CudaArch::SM_32:
      case CudaArch::SM_35:
      case CudaArch::SM_37:
      case CudaArch::SM_50:
      case CudaArch::SM_52:
      case CudaArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << CudaArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      // Architectures where the clause is accepted without diagnostics.
      case CudaArch::SM_60:
      case CudaArch::SM_61:
      case CudaArch::SM_62:
      case CudaArch::SM_70:
      case CudaArch::SM_72:
      case CudaArch::SM_75:
      case CudaArch::SM_80:
      case CudaArch::SM_86:
      case CudaArch::GFX600:
      case CudaArch::GFX601:
      case CudaArch::GFX602:
      case CudaArch::GFX700:
      case CudaArch::GFX701:
      case CudaArch::GFX702:
      case CudaArch::GFX703:
      case CudaArch::GFX704:
      case CudaArch::GFX705:
      case CudaArch::GFX801:
      case CudaArch::GFX802:
      case CudaArch::GFX803:
      case CudaArch::GFX805:
      case CudaArch::GFX810:
      case CudaArch::GFX900:
      case CudaArch::GFX902:
      case CudaArch::GFX904:
      case CudaArch::GFX906:
      case CudaArch::GFX908:
      case CudaArch::GFX909:
      case CudaArch::GFX90a:
      case CudaArch::GFX90c:
      case CudaArch::GFX940:
      case CudaArch::GFX1010:
      case CudaArch::GFX1011:
      case CudaArch::GFX1012:
      case CudaArch::GFX1013:
      case CudaArch::GFX1030:
      case CudaArch::GFX1031:
      case CudaArch::GFX1032:
      case CudaArch::GFX1033:
      case CudaArch::GFX1034:
      case CudaArch::GFX1035:
      case CudaArch::GFX1036:
      case CudaArch::GFX1100:
      case CudaArch::GFX1101:
      case CudaArch::GFX1102:
      case CudaArch::GFX1103:
      case CudaArch::Generic:
      case CudaArch::UNUSED:
      case CudaArch::UNKNOWN:
        break;
      case CudaArch::LAST:
        llvm_unreachable("Unexpected Cuda arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
// Module finalization.  If any team reductions were emitted, build a union
// of all collected reduction record types, create one internal global buffer
// of that union type, and initialize the reduction-buffer pointer global
// with its address.  Then run the base-class finalization.
void CGOpenMPRuntimeGPU::clear() {
  if (!TeamsReductions.empty()) {
    ASTContext &C = CGM.getContext();
    // A union is sized by its largest member, so a single buffer can serve
    // every reduction record type.
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
      QualType RecTy = C.getRecordType(TeamReductionRec);
      auto *Field = FieldDecl::Create(
          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
          nullptr, false,
          ICIS_NoInit);
      Field->setAccess(AS_public);
      StaticRD->addDecl(Field);
    }
    StaticRD->completeDefinition();
    QualType StaticTy = C.getRecordType(StaticRD);
    llvm::Type *LLVMReductionsBufferTy =
        CGM.getTypes().ConvertTypeForMem(StaticTy);
    // Zero-initialized internal global holding the reduction scratch space.
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), LLVMReductionsBufferTy,
        false, llvm::GlobalValue::InternalLinkage,
        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
        "_openmp_teams_reductions_buffer_$_");
    KernelTeamsReductionPtr->setInitializer(
        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
                                                             CGM.VoidPtrTy));
  }
  CGOpenMPRuntime::clear();
}
// Emit a call returning the number of hardware threads in the current block,
// declaring the device runtime entry point on first use.
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
  llvm::Module &M = CGF.CGM.getModule();
  StringRef FnName = "__kmpc_get_hardware_num_threads_in_block";
  llvm::Function *F = M.getFunction(FnName);
  if (!F) {
    // Declare: i32 @__kmpc_get_hardware_num_threads_in_block()
    llvm::FunctionType *FTy =
        llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false);
    F = llvm::Function::Create(FTy, llvm::GlobalVariable::ExternalLinkage,
                               FnName, &M);
  }
  return CGF.Builder.CreateCall(F, llvm::None, "nvptx_num_threads");
}
// Emit a call returning the hardware thread id within the current block via
// the OpenMP device runtime.
llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
  llvm::FunctionCallee Fn = OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block);
  return CGF.EmitRuntimeCall(Fn, llvm::None);
}
// Emit a call returning the warp size via the OpenMP device runtime.
llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
  llvm::FunctionCallee Fn = OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_get_warp_size);
  return CGF.EmitRuntimeCall(Fn, llvm::None);
}