//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file This pass replaces accesses to kernel arguments with loads from /// offsets from the kernarg base pointer. // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "GCNSubtarget.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" using namespace llvm; namespace { class PreloadKernelArgInfo { private: Function &F; const GCNSubtarget &ST; unsigned NumFreeUserSGPRs; public: SmallVector KernelArgMetadata; PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { setInitialFreeUserSGPRsCount(); } // Returns the maximum number of user SGPRs that we have available to preload // arguments. void setInitialFreeUserSGPRsCount() { const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs(); GCNUserSGPRUsageInfo UserSGPRInfo(F, ST); NumFreeUserSGPRs = MaxUserSGPRs - UserSGPRInfo.getNumUsedUserSGPRs(); } bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset, uint64_t LastExplicitArgOffset) { // Check if this argument may be loaded into the same register as the // previous argument. if (!isAligned(Align(4), ArgOffset) && AllocSize < 4) return true; // Pad SGPRs for kernarg alignment. unsigned Padding = ArgOffset - LastExplicitArgOffset; unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4; if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs) return false; NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs); return true; } }; class AMDGPULowerKernelArguments : public FunctionPass { public: static char ID; AMDGPULowerKernelArguments() : FunctionPass(ID) {} bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.setPreservesAll(); } }; } // end anonymous namespace // skip allocas static BasicBlock::iterator getInsertPt(BasicBlock &BB) { BasicBlock::iterator InsPt = BB.getFirstInsertionPt(); for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) { AllocaInst *AI = dyn_cast(&*InsPt); // If this is a dynamic alloca, the value may depend on the loaded kernargs, // so loads will need to be inserted before it. if (!AI || !AI->isStaticAlloca()) break; } return InsPt; } static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) return false; const GCNSubtarget &ST = TM.getSubtarget(F); LLVMContext &Ctx = F.getParent()->getContext(); const DataLayout &DL = F.getDataLayout(); BasicBlock &EntryBlock = *F.begin(); IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock)); const Align KernArgBaseAlign(16); // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(); Align MaxAlign; // FIXME: Alignment is broken with explicit arg offset.; const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) return false; CallInst *KernArgSegment = Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {}, nullptr, F.getName() + ".kernarg.segment"); KernArgSegment->addRetAttr(Attribute::NonNull); KernArgSegment->addRetAttr( Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; // Preloaded kernel arguments must be sequential. bool InPreloadSequence = true; PreloadKernelArgInfo PreloadInfo(F, ST); for (Argument &Arg : F.args()) { const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt; Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy); uint64_t Size = DL.getTypeSizeInBits(ArgTy); uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; uint64_t LastExplicitArgOffset = ExplicitArgOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; // Try to preload this argument into user SGPRs. if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() && !Arg.getType()->isAggregateType()) if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset, LastExplicitArgOffset)) continue; InPreloadSequence = false; if (Arg.use_empty()) continue; // If this is byval, the loads are already explicit in the function. We just // need to rewrite the pointer values. if (IsByRef) { Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64( Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".byval.kernarg.offset"); Value *CastOffsetPtr = Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType()); Arg.replaceAllUsesWith(CastOffsetPtr); continue; } if (PointerType *PT = dyn_cast(ArgTy)) { // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing // modes on SI to know the high bits are 0 so pointer adds don't wrap. We // can't represent this with range metadata because it's only allowed for // integer types. if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) && !ST.hasUsableDSOffset()) continue; // FIXME: We can replace this with equivalent alias.scope/noalias // metadata, but this appears to be a lot of work. if (Arg.hasNoAliasAttr()) continue; } auto *VT = dyn_cast(ArgTy); bool IsV3 = VT && VT->getNumElements() == 3; bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType(); VectorType *V4Ty = nullptr; int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; Align AdjustedAlign = commonAlignment( KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset); Value *ArgPtr; Type *AdjustedArgTy; if (DoShiftOpt) { // FIXME: Handle aggregate types // Since we don't have sub-dword scalar loads, avoid doing an extload by // loading earlier than the argument address, and extracting the relevant // bits. // TODO: Update this for GFX12 which does have scalar sub-dword loads. // // Additionally widen any sub-dword load to i32 even if suitably aligned, // so that CSE between different argument loads works easily. ArgPtr = Builder.CreateConstInBoundsGEP1_64( Builder.getInt8Ty(), KernArgSegment, AlignDownOffset, Arg.getName() + ".kernarg.offset.align.down"); AdjustedArgTy = Builder.getInt32Ty(); } else { ArgPtr = Builder.CreateConstInBoundsGEP1_64( Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".kernarg.offset"); AdjustedArgTy = ArgTy; } if (IsV3 && Size >= 32) { V4Ty = FixedVectorType::get(VT->getElementType(), 4); // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads AdjustedArgTy = V4Ty; } LoadInst *Load = Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); MDBuilder MDB(Ctx); if (isa(ArgTy)) { if (Arg.hasNonNullAttr()) Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {})); uint64_t DerefBytes = Arg.getDereferenceableBytes(); if (DerefBytes != 0) { Load->setMetadata( LLVMContext::MD_dereferenceable, MDNode::get(Ctx, MDB.createConstant( ConstantInt::get(Builder.getInt64Ty(), DerefBytes)))); } uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes(); if (DerefOrNullBytes != 0) { Load->setMetadata( LLVMContext::MD_dereferenceable_or_null, MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), DerefOrNullBytes)))); } if (MaybeAlign ParamAlign = Arg.getParamAlign()) { Load->setMetadata( LLVMContext::MD_align, MDNode::get(Ctx, MDB.createConstant(ConstantInt::get( Builder.getInt64Ty(), ParamAlign->value())))); } } // TODO: Convert noalias arg to !noalias if (DoShiftOpt) { Value *ExtractBits = OffsetDiff == 0 ? Load : Builder.CreateLShr(Load, OffsetDiff * 8); IntegerType *ArgIntTy = Builder.getIntNTy(Size); Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy); Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy, Arg.getName() + ".load"); Arg.replaceAllUsesWith(NewVal); } else if (IsV3) { Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef{0, 1, 2}, Arg.getName() + ".load"); Arg.replaceAllUsesWith(Shuf); } else { Load->setName(Arg.getName() + ".load"); Arg.replaceAllUsesWith(Load); } } KernArgSegment->addRetAttr( Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); return true; } bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { auto &TPC = getAnalysis(); const TargetMachine &TM = TPC.getTM(); return lowerKernelArguments(F, TM); } INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", false, false) INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", false, false) char AMDGPULowerKernelArguments::ID = 0; FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() { return new AMDGPULowerKernelArguments(); } PreservedAnalyses AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) { bool Changed = lowerKernelArguments(F, TM); if (Changed) { // TODO: Preserves a lot more. PreservedAnalyses PA; PA.preserveSet(); return PA; } return PreservedAnalyses::all(); }