//===-- AMDGPULowerBufferFatPointers.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers operations on buffer fat pointers (addrspace 7) to
// operations on buffer resources (addrspace 8) and is needed for correct
// codegen.
//
// # Background
//
// Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists
// of a 128-bit buffer descriptor and a 32-bit offset into that descriptor.
// The buffer resource part needs to be a "raw" buffer resource (it must have
// a stride of 0 and bounds checks must be in raw buffer mode or disabled).
//
// When these requirements are met, a buffer resource can be treated as a
// typical (though quite wide) pointer that follows typical LLVM pointer
// semantics. This allows the frontend to reason about such buffers (which are
// often encountered in the context of SPIR-V kernels).
//
// However, because of their non-power-of-2 size, these fat pointers cannot be
// present during translation to MIR (though this restriction may be lifted
// during the transition to GlobalISel). Therefore, this pass is needed in
// order to correctly implement these fat pointers.
//
// The resource intrinsics take the resource part (the address space 8 pointer)
// and the offset part (the 32-bit integer) as separate arguments. In addition,
// many users of these buffers manipulate the offset while leaving the resource
// part alone. For these reasons, we want to typically separate the resource
// and offset parts into separate variables, but combine them together when
// encountering cases where this is required, such as by inserting these values
// into aggregates or moving them to memory.
//
// Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8)
// %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8),
// i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes
// `{vector<Nxp8>, vector<Nxi32>}` and its component parts.
//
// # Implementation
//
// This pass proceeds in three main phases:
//
// ## Rewriting loads and stores of p7
//
// The first phase is to rewrite away all loads and stores of
// `ptr addrspace(7)`, including aggregates containing such pointers, to ones
// that use `i160`. This is handled by `StoreFatPtrsAsIntsVisitor`, which
// visits loads, stores, and allocas and, if the loaded or stored type contains
// `ptr addrspace(7)`, rewrites that type to one where the p7s are replaced by
// i160s, copying other parts of aggregates as needed. In the case of a store,
// each pointer is `ptrtoint`d to i160 before storing, and loaded integers are
// `inttoptr`d back. This same transformation is applied to vectors of
// pointers.
//
// Such a transformation allows the later phases of the pass to not need
// to handle buffer fat pointers moving to and from memory, where we would
// have to handle the incompatibility between a `{Nxp8, Nxi32}` representation
// and an `Nxi160` one directly. Instead, that transposing action (where the
// vectors of resources and vectors of offsets are concatenated before being
// stored to memory) is handled through implementing `inttoptr` and `ptrtoint`
// only.
//
// Atomic operations on `ptr addrspace(7)` values are not supported, as the
// hardware does not include a 160-bit atomic.
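//
// As an illustrative sketch (value names are made up, not taken from any
// test), a store of a fat pointer
// ```
//   store ptr addrspace(7) %p, ptr %slot
// ```
// is rewritten by this phase into
// ```
//   %p.int = ptrtoint ptr addrspace(7) %p to i160
//   store i160 %p.int, ptr %slot
// ```
// and the matching load uses `inttoptr` to recover the pointer.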
//
// ## Type remapping
//
// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
// to the corresponding struct type, which has a resource part and an offset
// part.
//
// This uses a `BufferFatPtrToStructTypeMap` and a `FatPtrConstMaterializer`
// to perform the rewriting, usually by way of `setType`ing values. Constants
// are handled here because there isn't a good way to fix them up later.
//
// This has the downside of leaving the IR in an invalid state (for example,
// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
// but all such invalid states will be resolved by the third phase.
//
// Functions that don't take buffer fat pointers are modified in place. Those
// that do take such pointers have their basic blocks moved to a new function
// whose arguments and return values use {ptr addrspace(8), i32} structs in
// place of the fat pointers. This phase also records intrinsics so that they
// can be remangled or deleted later.
//
// ## Splitting pointer structs
//
// The meat of this pass consists of defining semantics for operations that
// produce or consume [vectors of] buffer fat pointers in terms of their
// resource and offset parts. This is accomplished through the
// `SplitPtrStructs` visitor.
//
// In the first pass through each function that is being lowered, the splitter
// inserts new instructions to implement the split-structures behavior, which
// is needed for correctness and performance. It records a list of "split
// users", instructions that are being replaced by operations on the resource
// and offset parts.
//
// Split users do not necessarily need to produce parts themselves (a
// `load float, ptr addrspace(7)` does not, for example), but, if they do not
// generate fat buffer pointers, they must RAUW in their replacement
// instructions during the initial visit.
//
// When these new instructions are created, they use the split parts recorded
// for their initial arguments in order to generate their replacements,
// creating a parallel set of instructions that does not refer to the original
// fat pointer values but instead to their resource and offset components.
//
// Instructions, such as `extractvalue`, that produce buffer fat pointers from
// sources that do not have split parts, have such parts generated using
// `extractvalue`. This is also the initial handling of PHI nodes, which
// are then cleaned up.
//
// ### Conditionals
//
// PHI nodes are initially given resource parts via `extractvalue`. However,
// this is not an efficient rewrite of such nodes, as, in most cases, the
// resource part in a conditional or loop remains constant throughout the loop
// and only the offset varies. Failing to optimize away these constant
// resources would cause additional registers to be sent around loops and
// might lead to waterfall loops being generated for buffer operations due to
// the "non-uniform" resource argument.
//
// Therefore, after all instructions have been visited, the pointer splitter
// post-processes all encountered conditionals. Given a PHI node or select,
// getPossibleRsrcRoots() collects all values that the resource parts of that
// conditional's input could come from as well as collecting all conditional
// instructions encountered during the search.
// If, after filtering out the initial node itself, the set of encountered
// conditionals is a subset of the potential roots and there is a single
// potential resource that isn't in the conditional set, that value is the only
// possible value the resource argument could have throughout the control flow.
//
// If that condition is met, then a PHI node can have its resource part changed
// to the singleton value and then be replaced by a PHI on the offsets.
// Otherwise, each PHI node is split into two, one for the resource part and
// one for the offset part, which replace the temporary `extractvalue`
// instructions that were added during the first pass.
//
// Similar logic applies to `select`, where
// `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y`
// can be split into `%z.rsrc = %x.rsrc` and
// `%z.off = select i1 %cond, i32 %x.off, i32 %y.off`
// if both `%x` and `%y` have the same resource part, but two `select`
// operations will be needed if they do not.
//
// ### Final processing
//
// After conditionals have been cleaned up, the IR for each function is
// rewritten to remove all the old instructions that have been split up.
//
// Any instruction that used to produce a buffer fat pointer (and therefore now
// produces a resource-and-offset struct after type remapping) is
// replaced as follows:
// 1. All debug value annotations are cloned to reflect that the resource part
//    and offset parts are computed separately and constitute different
//    fragments of the underlying source language variable.
// 2. All uses that were themselves split are replaced by a `poison` of the
//    struct type, as they will themselves be erased soon. This rule, combined
//    with debug handling, should leave the use lists of split instructions
//    empty in almost all cases.
// 3. If a user of the original struct-valued result remains, the structure
//    needed for the new types to work is constructed out of the newly-defined
//    parts, and the original instruction is replaced by this structure
//    before being erased. Instructions requiring this construction include
//    `ret` and `insertvalue`.
//
// # Consequences
//
// This pass does not alter the CFG.
//
// Alias analysis information will become coarser, as the LLVM alias analyzer
// cannot handle the buffer intrinsics. Specifically, while we can determine
// that the following two loads do not alias:
// ```
//   %y = getelementptr i32, ptr addrspace(7) %x, i32 1
//   %a = load i32, ptr addrspace(7) %x
//   %b = load i32, ptr addrspace(7) %y
// ```
// we cannot (except through some code that runs during scheduling) determine
// that the rewritten loads below do not alias.
// ```
//   %y.off = add i32 %x.off, 1
//   %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32
//     %x.off, ...)
//   %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8)
//     %x.rsrc, i32 %y.off, ...)
// ```
// However, existing alias information is preserved.
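//
// As an illustrative sketch (not taken from any test), such existing
// information survives because the rewrites copy metadata from each old
// instruction onto its replacement:
// ```
//   %a = load i32, ptr addrspace(7) %x, !alias.scope !0
// ```
// becomes, after lowering,
// ```
//   %a = call i32 @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc,
//     i32 %x.off, i32 0, i32 0), !alias.scope !0
// ```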
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"

using namespace llvm;

static constexpr unsigned BufferOffsetWidth = 32;

namespace {
/// Recursively replace instances of ptr addrspace(7) and
/// vector<Nxptr addrspace(7)> with some other type as defined by the relevant
/// subclass.
class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper {
  DenseMap<Type *, Type *> Map;

  Type *remapTypeImpl(Type *Ty, SmallPtrSetImpl<StructType *> &Seen);

protected:
  virtual Type *remapScalar(PointerType *PT) = 0;
  virtual Type *remapVector(VectorType *VT) = 0;

  const DataLayout &DL;

public:
  BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {}
  Type *remapType(Type *SrcTy) override;
  void clear() { Map.clear(); }
};

/// Remap ptr addrspace(7) to i160 and vector<Nxptr addrspace(7)> to
/// vector<Nxi160> in order to correctly handle loading/storing these values
/// from memory.
class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase {
  using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;

protected:
  Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); }
  Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); }
};

/// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset
/// parts of the pointer) so that we can easily rewrite operations on these
/// values that aren't loading them from or storing them to memory.
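/// For example (illustrative), `ptr addrspace(7)` maps to
/// `{ptr addrspace(8), i32}`, and `<4 x ptr addrspace(7)>` maps to
/// `{<4 x ptr addrspace(8)>, <4 x i32>}`.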
class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase {
  using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase;

protected:
  Type *remapScalar(PointerType *PT) override;
  Type *remapVector(VectorType *VT) override;
};
} // namespace

// This code is adapted from the type remapper in lib/Linker/IRMover.cpp
Type *BufferFatPtrTypeLoweringBase::remapTypeImpl(
    Type *Ty, SmallPtrSetImpl<StructType *> &Seen) {
  Type **Entry = &Map[Ty];
  if (*Entry)
    return *Entry;
  if (auto *PT = dyn_cast<PointerType>(Ty)) {
    if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
      return *Entry = remapScalar(PT);
    }
  }
  if (auto *VT = dyn_cast<VectorType>(Ty)) {
    auto *PT = dyn_cast<PointerType>(VT->getElementType());
    if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) {
      return *Entry = remapVector(VT);
    }
    return *Entry = Ty;
  }
  // Whether the type is one that is structurally uniqued - that is, if it is
  // not a named struct (the only kind of type for which multiple structurally
  // identical instances have distinct `Type *`s).
  StructType *TyAsStruct = dyn_cast<StructType>(Ty);
  bool IsUniqued = !TyAsStruct || TyAsStruct->isLiteral();
  // Base case for ints, floats, opaque pointers, and so on, which don't
  // require recursion.
  if (Ty->getNumContainedTypes() == 0 && IsUniqued)
    return *Entry = Ty;
  if (!IsUniqued) {
    // Create a dummy type for recursion purposes.
    if (!Seen.insert(TyAsStruct).second) {
      StructType *Placeholder = StructType::create(Ty->getContext());
      return *Entry = Placeholder;
    }
  }
  bool Changed = false;
  SmallVector<Type *> ElementTypes(Ty->getNumContainedTypes(), nullptr);
  for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) {
    Type *OldElem = Ty->getContainedType(I);
    Type *NewElem = remapTypeImpl(OldElem, Seen);
    ElementTypes[I] = NewElem;
    Changed |= (OldElem != NewElem);
  }
  // Recursive calls to remapTypeImpl() may have invalidated the pointer.
  Entry = &Map[Ty];
  if (!Changed) {
    return *Entry = Ty;
  }
  if (auto *ArrTy = dyn_cast<ArrayType>(Ty))
    return *Entry = ArrayType::get(ElementTypes[0], ArrTy->getNumElements());
  if (auto *FnTy = dyn_cast<FunctionType>(Ty))
    return *Entry = FunctionType::get(ElementTypes[0],
                                      ArrayRef(ElementTypes).slice(1),
                                      FnTy->isVarArg());
  if (auto *STy = dyn_cast<StructType>(Ty)) {
    // Genuine opaque types don't have a remapping.
    if (STy->isOpaque())
      return *Entry = Ty;
    bool IsPacked = STy->isPacked();
    if (IsUniqued)
      return *Entry =
                 StructType::get(Ty->getContext(), ElementTypes, IsPacked);
    SmallString<16> Name(STy->getName());
    STy->setName("");
    Type **RecursionEntry = &Map[Ty];
    if (*RecursionEntry) {
      auto *Placeholder = cast<StructType>(*RecursionEntry);
      Placeholder->setBody(ElementTypes, IsPacked);
      Placeholder->setName(Name);
      return *Entry = Placeholder;
    }
    return *Entry = StructType::create(Ty->getContext(), ElementTypes, Name,
                                       IsPacked);
  }
  llvm_unreachable("Unknown type of type that contains elements");
}

Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) {
  SmallPtrSet<StructType *, 2> Visited;
  return remapTypeImpl(SrcTy, Visited);
}

Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) {
  LLVMContext &Ctx = PT->getContext();
  return StructType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE),
                         IntegerType::get(Ctx, BufferOffsetWidth));
}

Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) {
  ElementCount EC = VT->getElementCount();
  LLVMContext &Ctx = VT->getContext();
  Type *RsrcVec =
      VectorType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), EC);
  Type *OffVec = VectorType::get(IntegerType::get(Ctx, BufferOffsetWidth), EC);
  return StructType::get(RsrcVec, OffVec);
}

static bool isBufferFatPtrOrVector(Type *Ty) {
  if (auto *PT = dyn_cast<PointerType>(Ty->getScalarType()))
    return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER;
  return false;
}

// True if the type is {ptr addrspace(8), i32} or a struct containing vectors
// of those types. Used to quickly skip instructions we don't need to process.
static bool isSplitFatPtr(Type *Ty) {
  auto *ST = dyn_cast<StructType>(Ty);
  if (!ST)
    return false;
  if (!ST->isLiteral() || ST->getNumElements() != 2)
    return false;
  auto *MaybeRsrc =
      dyn_cast<PointerType>(ST->getElementType(0)->getScalarType());
  auto *MaybeOff =
      dyn_cast<IntegerType>(ST->getElementType(1)->getScalarType());
  return MaybeRsrc && MaybeOff &&
         MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE &&
         MaybeOff->getBitWidth() == BufferOffsetWidth;
}

// True if the result type or any argument types are buffer fat pointers.
static bool isBufferFatPtrConst(Constant *C) {
  Type *T = C->getType();
  return isBufferFatPtrOrVector(T) || any_of(C->operands(), [](const Use &U) {
           return isBufferFatPtrOrVector(U.get()->getType());
         });
}

namespace {
/// Convert [vectors of] buffer fat pointers to integers when they are read
/// from or stored to memory. This ensures that these pointers will have the
/// same memory layout as before they are lowered, even though they will no
/// longer have their previous layout in registers/in the program (they'll be
/// broken down into resource and offset parts). This has the downside of
/// imposing marshalling costs when reading or storing these values, but since
/// placing such pointers into memory is an uncommon operation at best, we feel
/// that this cost is acceptable for better performance in the common case.
class StoreFatPtrsAsIntsVisitor
    : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
  BufferFatPtrToIntTypeMap *TypeMap;

  ValueToValueMapTy ConvertedForStore;

  IRBuilder<> IRB;

  // Convert all the buffer fat pointers within the input value to integers
  // so that it can be stored in memory.
  Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
  // Convert all the i160s that need to be buffer fat pointers (as specified
  // by the To type) into those pointers to preserve the semantics of the rest
  // of the program.
  Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);

public:
  StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
      : TypeMap(TypeMap), IRB(Ctx) {}
  bool processFunction(Function &F);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitAllocaInst(AllocaInst &I);
  bool visitLoadInst(LoadInst &LI);
  bool visitStoreInst(StoreInst &SI);
  bool visitGetElementPtrInst(GetElementPtrInst &I);
};
} // namespace

Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
                                                const Twine &Name) {
  if (From == To)
    return V;
  ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
  if (Find != ConvertedForStore.end())
    return Find->second;
  if (isBufferFatPtrOrVector(From)) {
    Value *Cast = IRB.CreatePtrToInt(V, To, Name + ".int");
    ConvertedForStore[V] = Cast;
    return Cast;
  }
  if (From->getNumContainedTypes() == 0)
    return V;
  // Structs, arrays, and other compound types.
  Value *Ret = PoisonValue::get(To);
  if (auto *AT = dyn_cast<ArrayType>(From)) {
    Type *FromPart = AT->getArrayElementType();
    Type *ToPart = cast<ArrayType>(To)->getElementType();
    for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
      Value *Field = IRB.CreateExtractValue(V, I);
      Value *NewField =
          fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(I));
      Ret = IRB.CreateInsertValue(Ret, NewField, I);
    }
  } else {
    for (auto [Idx, FromPart, ToPart] :
         enumerate(From->subtypes(), To->subtypes())) {
      Value *Field = IRB.CreateExtractValue(V, Idx);
      Value *NewField =
          fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(Idx));
      Ret = IRB.CreateInsertValue(Ret, NewField, Idx);
    }
  }
  ConvertedForStore[V] = Ret;
  return Ret;
}

Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
                                                const Twine &Name) {
  if (From == To)
    return V;
  if (isBufferFatPtrOrVector(To)) {
    Value *Cast = IRB.CreateIntToPtr(V, To, Name + ".ptr");
    return Cast;
  }
  if (From->getNumContainedTypes() == 0)
    return V;
  // Structs, arrays, and other compound types.
  Value *Ret = PoisonValue::get(To);
  if (auto *AT = dyn_cast<ArrayType>(From)) {
    Type *FromPart = AT->getArrayElementType();
    Type *ToPart = cast<ArrayType>(To)->getElementType();
    for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) {
      Value *Field = IRB.CreateExtractValue(V, I);
      Value *NewField =
          intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(I));
      Ret = IRB.CreateInsertValue(Ret, NewField, I);
    }
  } else {
    for (auto [Idx, FromPart, ToPart] :
         enumerate(From->subtypes(), To->subtypes())) {
      Value *Field = IRB.CreateExtractValue(V, Idx);
      Value *NewField =
          intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(Idx));
      Ret = IRB.CreateInsertValue(Ret, NewField, Idx);
    }
  }
  return Ret;
}

bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
  bool Changed = false;
  // The visitors will mutate GEPs and allocas, but will push loads and stores
  // to the worklist to avoid invalidation.
  for (Instruction &I : make_early_inc_range(instructions(F))) {
    Changed |= visit(I);
  }
  ConvertedForStore.clear();
  return Changed;
}

bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
  Type *Ty = I.getAllocatedType();
  Type *NewTy = TypeMap->remapType(Ty);
  if (Ty == NewTy)
    return false;
  I.setAllocatedType(NewTy);
  return true;
}

bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
  Type *Ty = I.getSourceElementType();
  Type *NewTy = TypeMap->remapType(Ty);
  if (Ty == NewTy)
    return false;
  // We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so
  // make sure GEPs don't have different semantics with the new type.
  I.setSourceElementType(NewTy);
  I.setResultElementType(TypeMap->remapType(I.getResultElementType()));
  return true;
}

bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
  Type *Ty = LI.getType();
  Type *IntTy = TypeMap->remapType(Ty);
  if (Ty == IntTy)
    return false;

  IRB.SetInsertPoint(&LI);
  auto *NLI = cast<LoadInst>(LI.clone());
  NLI->mutateType(IntTy);
  NLI = IRB.Insert(NLI);
  copyMetadataForLoad(*NLI, LI);
  NLI->takeName(&LI);

  Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName());
  LI.replaceAllUsesWith(CastBack);
  LI.eraseFromParent();
  return true;
}

bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
  Value *V = SI.getValueOperand();
  Type *Ty = V->getType();
  Type *IntTy = TypeMap->remapType(Ty);
  if (Ty == IntTy)
    return false;

  IRB.SetInsertPoint(&SI);
  Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName());
  for (auto *Dbg : at::getAssignmentMarkers(&SI))
    Dbg->setValue(IntV);

  SI.setOperand(0, IntV);
  return true;
}

/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a
/// lowered buffer fat pointer constant.
static std::pair<Constant *, Constant *>
splitLoweredFatBufferConst(Constant *C) {
  assert(isSplitFatPtr(C->getType()) && "Not a split fat buffer pointer");
  return std::make_pair(C->getAggregateElement(0u), C->getAggregateElement(1u));
}

namespace {
/// Handle the remapping of ptr addrspace(7) constants.
class FatPtrConstMaterializer final : public ValueMaterializer {
  BufferFatPtrToStructTypeMap *TypeMap;
  // An internal mapper that is used to recurse into the arguments of
  // constants. While the documentation for `ValueMapper` specifies not to use
  // it recursively, examination of the logic in mapValue() shows that it can
  // safely be used recursively when handling constants, like it does in its
  // own logic.
  ValueMapper InternalMapper;

  Constant *materializeBufferFatPtrConst(Constant *C);

public:
  // UnderlyingMap is the value map this materializer will be filling.
  FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap,
                          ValueToValueMapTy &UnderlyingMap)
      : TypeMap(TypeMap),
        InternalMapper(UnderlyingMap, RF_None, TypeMap, this) {}
  virtual ~FatPtrConstMaterializer() = default;

  Value *materialize(Value *V) override;
};
} // namespace

Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) {
  Type *SrcTy = C->getType();
  auto *NewTy = dyn_cast<StructType>(TypeMap->remapType(SrcTy));
  if (C->isNullValue())
    return ConstantAggregateZero::getNullValue(NewTy);
  if (isa<PoisonValue>(C)) {
    return ConstantStruct::get(NewTy,
                               {PoisonValue::get(NewTy->getElementType(0)),
                                PoisonValue::get(NewTy->getElementType(1))});
  }
  if (isa<UndefValue>(C)) {
    return ConstantStruct::get(NewTy,
                               {UndefValue::get(NewTy->getElementType(0)),
                                UndefValue::get(NewTy->getElementType(1))});
  }

  if (auto *VC = dyn_cast<ConstantVector>(C)) {
    if (Constant *S = VC->getSplatValue()) {
      Constant *NewS = InternalMapper.mapConstant(*S);
      if (!NewS)
        return nullptr;
      auto [Rsrc, Off] = splitLoweredFatBufferConst(NewS);
      auto EC = VC->getType()->getElementCount();
      return ConstantStruct::get(NewTy, {ConstantVector::getSplat(EC, Rsrc),
                                         ConstantVector::getSplat(EC, Off)});
    }
    SmallVector<Constant *> Rsrcs;
    SmallVector<Constant *> Offs;
    for (Value *Op : VC->operand_values()) {
      auto *NewOp = dyn_cast_or_null<Constant>(InternalMapper.mapValue(*Op));
      if (!NewOp)
        return nullptr;
      auto [Rsrc, Off] = splitLoweredFatBufferConst(NewOp);
      Rsrcs.push_back(Rsrc);
      Offs.push_back(Off);
    }
    Constant *RsrcVec = ConstantVector::get(Rsrcs);
    Constant *OffVec = ConstantVector::get(Offs);
    return ConstantStruct::get(NewTy, {RsrcVec, OffVec});
  }

  if (isa<GlobalValue>(C))
    report_fatal_error("Global values containing ptr addrspace(7) (buffer "
                       "fat pointer) values are not supported");

  if (isa<ConstantExpr>(C))
    report_fatal_error("Constant exprs containing ptr addrspace(7) (buffer "
                       "fat pointer) values should have been expanded earlier");

  return nullptr;
}

Value *FatPtrConstMaterializer::materialize(Value *V) {
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;
  // Structs and other types that happen to contain fat pointers get remapped
  // by the mapValue() logic.
  if (!isBufferFatPtrConst(C))
    return nullptr;
  return materializeBufferFatPtrConst(C);
}

using PtrParts = std::pair<Value *, Value *>;
namespace {
// The visitor returns the resource and offset parts for an instruction if
// they can be computed, or (nullptr, nullptr) for cases that don't have a
// meaningful value mapping.
class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
  ValueToValueMapTy RsrcParts;
  ValueToValueMapTy OffParts;

  // Track instructions that have been rewritten into a user of the component
  // parts of their ptr addrspace(7) input. Instructions that produced
  // ptr addrspace(7) parts should **not** be RAUW'd before being added to this
  // set, as that replacement will be handled in a post-visit step. However,
  // instructions that yield values that aren't fat pointers (ex. ptrtoint)
  // should RAUW themselves with new instructions that use the split parts
  // of their arguments during processing.
  DenseSet<Instruction *> SplitUsers;

  // Nodes that need a second look once we've computed the parts for all other
  // instructions to see if, for example, we really need to phi on the resource
  // part.
  SmallVector<Instruction *> Conditionals;
  // Temporary instructions produced while lowering conditionals that should be
  // killed.
  SmallVector<Instruction *> ConditionalTemps;

  // Subtarget info, needed for determining what cache control bits to set.
  const TargetMachine *TM;
  const GCNSubtarget *ST = nullptr;

  IRBuilder<> IRB;

  // Copy metadata between instructions if applicable.
  void copyMetadata(Value *Dest, Value *Src);

  // Get the resource and offset parts of the value V, inserting appropriate
  // extractvalue calls if needed.
  PtrParts getPtrParts(Value *V);

  // Given an instruction that could produce multiple resource parts (a PHI or
  // select), collect the set of possible values that could have provided its
  // resource parts (the `Roots`) and the set of conditional instructions
  // visited during the search (`Seen`). If, after removing the root of the
  // search from `Seen` and `Roots`, `Seen` is a subset of `Roots` and
  // `Roots - Seen` contains one element, the resource part of that element
  // can replace the resource part of all other elements in `Seen`.
  void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl<Value *> &Roots,
                            SmallPtrSetImpl<Value *> &Seen);
  void processConditionals();

  // If an instruction has been split into resource and offset parts,
  // delete that instruction. If any of its uses have not themselves been split
  // into parts (for example, an insertvalue), construct the structure
  // that the type rewrites declared should be produced by the dying
  // instruction and use that.
  // Also, kill the temporary extractvalue operations produced by the two-stage
  // lowering of PHIs and conditionals.
  void killAndReplaceSplitInstructions(SmallVectorImpl<Instruction *> &Origs);

  void setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx);
  void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
  void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID);
  Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty,
                          Align Alignment, AtomicOrdering Order,
                          bool IsVolatile, SyncScope::ID SSID);

public:
  SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM)
      : TM(TM), IRB(Ctx) {}

  void processFunction(Function &F);

  PtrParts visitInstruction(Instruction &I);
  PtrParts visitLoadInst(LoadInst &LI);
  PtrParts visitStoreInst(StoreInst &SI);
  PtrParts visitAtomicRMWInst(AtomicRMWInst &AI);
  PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI);
  PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP);

  PtrParts visitPtrToIntInst(PtrToIntInst &PI);
  PtrParts visitIntToPtrInst(IntToPtrInst &IP);
  PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I);
  PtrParts visitICmpInst(ICmpInst &Cmp);
  PtrParts visitFreezeInst(FreezeInst &I);

  PtrParts visitExtractElementInst(ExtractElementInst &I);
  PtrParts visitInsertElementInst(InsertElementInst &I);
  PtrParts visitShuffleVectorInst(ShuffleVectorInst &I);

  PtrParts visitPHINode(PHINode &PHI);
  PtrParts visitSelectInst(SelectInst &SI);

  PtrParts visitIntrinsicInst(IntrinsicInst &II);
};
} // namespace

void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) {
  auto *DestI = dyn_cast<Instruction>(Dest);
  auto *SrcI = dyn_cast<Instruction>(Src);

  if (!DestI || !SrcI)
    return;

  DestI->copyMetadata(*SrcI);
}

PtrParts SplitPtrStructs::getPtrParts(Value *V) {
  assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts "
                                        "of something that wasn't rewritten");
  auto *RsrcEntry = &RsrcParts[V];
  auto *OffEntry = &OffParts[V];
  if (*RsrcEntry && *OffEntry)
    return {*RsrcEntry, *OffEntry};

  if (auto *C = dyn_cast<Constant>(V)) {
    auto [Rsrc, Off] = splitLoweredFatBufferConst(C);
    return {*RsrcEntry = Rsrc, *OffEntry = Off};
  }

  IRBuilder<>::InsertPointGuard Guard(IRB);
  if (auto *I = dyn_cast<Instruction>(V)) {
    LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
    auto [Rsrc, Off] = visit(*I);
    if (Rsrc && Off)
      return {*RsrcEntry = Rsrc, *OffEntry = Off};
    // We'll be creating the new values after the relevant instruction.
    // This instruction generates a value and so isn't a terminator.
    IRB.SetInsertPoint(*I->getInsertionPointAfterDef());
    IRB.SetCurrentDebugLocation(I->getDebugLoc());
  } else if (auto *A = dyn_cast<Argument>(V)) {
    IRB.SetInsertPointPastAllocas(A->getParent());
    IRB.SetCurrentDebugLocation(DebugLoc());
  }
  Value *Rsrc = IRB.CreateExtractValue(V, 0, V->getName() + ".rsrc");
  Value *Off = IRB.CreateExtractValue(V, 1, V->getName() + ".off");
  return {*RsrcEntry = Rsrc, *OffEntry = Off};
}

/// Returns the instruction that defines the resource part of the value V.
/// Note that this is not getUnderlyingObject(), since that looks through
/// operations like ptrmask which might modify the resource part.
///
/// We can limit ourselves to just looking through GEPs followed by looking
/// through addrspacecasts because only those two operations preserve the
/// resource part, and because operations on an `addrspace(8)` (which is the
/// legal input to this addrspacecast) would produce a different resource part.
static Value *rsrcPartRoot(Value *V) {
  while (auto *GEP = dyn_cast<GEPOperator>(V))
    V = GEP->getPointerOperand();
  while (auto *ASC = dyn_cast<AddrSpaceCastOperator>(V))
    V = ASC->getPointerOperand();
  return V;
}

void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
                                           SmallPtrSetImpl<Value *> &Roots,
                                           SmallPtrSetImpl<Value *> &Seen) {
  if (auto *PHI = dyn_cast<PHINode>(I)) {
    if (!Seen.insert(I).second)
      return;
    for (Value *In : PHI->incoming_values()) {
      In = rsrcPartRoot(In);
      Roots.insert(In);
      if (isa<PHINode, SelectInst>(In))
        getPossibleRsrcRoots(cast<Instruction>(In), Roots, Seen);
    }
  } else if (auto *SI = dyn_cast<SelectInst>(I)) {
    if (!Seen.insert(SI).second)
      return;
    Value *TrueVal = rsrcPartRoot(SI->getTrueValue());
    Value *FalseVal = rsrcPartRoot(SI->getFalseValue());
    Roots.insert(TrueVal);
    Roots.insert(FalseVal);
    if (isa<PHINode, SelectInst>(TrueVal))
      getPossibleRsrcRoots(cast<Instruction>(TrueVal), Roots, Seen);
    if (isa<PHINode, SelectInst>(FalseVal))
      getPossibleRsrcRoots(cast<Instruction>(FalseVal), Roots, Seen);
  } else {
    llvm_unreachable("getPossibleRsrcParts() only works on phi and select");
  }
}

void SplitPtrStructs::processConditionals() {
  SmallDenseMap<Instruction *, Value *> FoundRsrcs;
  SmallPtrSet<Value *, 4> Roots;
  SmallPtrSet<Value *, 4> Seen;
  for (Instruction *I : Conditionals) {
    // These have to exist by now because we've visited these nodes.
    Value *Rsrc = RsrcParts[I];
    Value *Off = OffParts[I];
    assert(Rsrc && Off && "must have visited conditionals by now");

    std::optional<Value *> MaybeRsrc;
    auto MaybeFoundRsrc = FoundRsrcs.find(I);
    if (MaybeFoundRsrc != FoundRsrcs.end()) {
      MaybeRsrc = MaybeFoundRsrc->second;
    } else {
      IRBuilder<>::InsertPointGuard Guard(IRB);
      Roots.clear();
      Seen.clear();
      getPossibleRsrcRoots(I, Roots, Seen);
      LLVM_DEBUG(dbgs() << "Processing conditional: " << *I << "\n");
#ifndef NDEBUG
      for (Value *V : Roots)
        LLVM_DEBUG(dbgs() << "Root: " << *V << "\n");
      for (Value *V : Seen)
        LLVM_DEBUG(dbgs() << "Seen: " << *V << "\n");
#endif
      // If we are our own possible root, then we shouldn't block our
      // replacement with a valid incoming value.
      Roots.erase(I);
      // We don't want to block the optimization for conditionals that don't
      // refer to themselves but did see themselves during the traversal.
      Seen.erase(I);

      if (set_is_subset(Seen, Roots)) {
        auto Diff = set_difference(Roots, Seen);
        if (Diff.size() == 1) {
          Value *RootVal = *Diff.begin();
          // Handle the case where previous loops already looked through
          // an addrspacecast.
          if (isSplitFatPtr(RootVal->getType()))
            MaybeRsrc = std::get<0>(getPtrParts(RootVal));
          else
            MaybeRsrc = RootVal;
        }
      }
    }

    if (auto *PHI = dyn_cast<PHINode>(I)) {
      Value *NewRsrc;
      StructType *PHITy = cast<StructType>(PHI->getType());
      IRB.SetInsertPoint(*PHI->getInsertionPointAfterDef());
      IRB.SetCurrentDebugLocation(PHI->getDebugLoc());
      if (MaybeRsrc) {
        NewRsrc = *MaybeRsrc;
      } else {
        Type *RsrcTy = PHITy->getElementType(0);
        auto *RsrcPHI = IRB.CreatePHI(RsrcTy, PHI->getNumIncomingValues());
        RsrcPHI->takeName(Rsrc);
        for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) {
          Value *VRsrc = std::get<0>(getPtrParts(V));
          RsrcPHI->addIncoming(VRsrc, BB);
        }
        copyMetadata(RsrcPHI, PHI);
        NewRsrc = RsrcPHI;
      }

      Type *OffTy = PHITy->getElementType(1);
      auto *NewOff = IRB.CreatePHI(OffTy, PHI->getNumIncomingValues());
      NewOff->takeName(Off);
      for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) {
        assert(OffParts.count(V) && "An offset part had to be created by now");
        Value *VOff = std::get<1>(getPtrParts(V));
        NewOff->addIncoming(VOff, BB);
      }
      copyMetadata(NewOff, PHI);

      // Note: We don't eraseFromParent() the temporaries because we don't
      // want to put the corrections maps in an inconsistent state. That'll be
      // handled during the rest of the killing. Also, `ValueToValueMapTy`
      // guarantees that references in that map will be updated as well.
      ConditionalTemps.push_back(cast<Instruction>(Rsrc));
      ConditionalTemps.push_back(cast<Instruction>(Off));
      Rsrc->replaceAllUsesWith(NewRsrc);
      Off->replaceAllUsesWith(NewOff);

      // Save on recomputing the cycle traversals in known-root cases.
      if (MaybeRsrc)
        for (Value *V : Seen)
          FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
    } else if (isa<SelectInst>(I)) {
      if (MaybeRsrc) {
        ConditionalTemps.push_back(cast<Instruction>(Rsrc));
        Rsrc->replaceAllUsesWith(*MaybeRsrc);
        for (Value *V : Seen)
          FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc;
      }
    } else {
      llvm_unreachable("Only PHIs and selects go in the conditionals list");
    }
  }
}

void SplitPtrStructs::killAndReplaceSplitInstructions(
    SmallVectorImpl<Instruction *> &Origs) {
  for (Instruction *I : ConditionalTemps)
    I->eraseFromParent();

  for (Instruction *I : Origs) {
    if (!SplitUsers.contains(I))
      continue;

    SmallVector<DbgValueInst *> Dbgs;
    findDbgValues(Dbgs, I);
    for (auto *Dbg : Dbgs) {
      IRB.SetInsertPoint(Dbg);
      auto &DL = I->getDataLayout();
      assert(isSplitFatPtr(I->getType()) &&
             "We should've RAUW'd away loads, stores, etc. at this point");
      auto *OffDbg = cast<DbgValueInst>(Dbg->clone());
      copyMetadata(OffDbg, Dbg);
      auto [Rsrc, Off] = getPtrParts(I);

      int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType());
      int64_t OffSz = DL.getTypeSizeInBits(Off->getType());

      std::optional<DIExpression *> RsrcExpr =
          DIExpression::createFragmentExpression(Dbg->getExpression(), 0,
                                                 RsrcSz);
      std::optional<DIExpression *> OffExpr =
          DIExpression::createFragmentExpression(Dbg->getExpression(), RsrcSz,
                                                 OffSz);
      if (OffExpr) {
        OffDbg->setExpression(*OffExpr);
        OffDbg->replaceVariableLocationOp(I, Off);
        IRB.Insert(OffDbg);
      } else {
        OffDbg->deleteValue();
      }
      if (RsrcExpr) {
        Dbg->setExpression(*RsrcExpr);
        Dbg->replaceVariableLocationOp(I, Rsrc);
      } else {
        Dbg->replaceVariableLocationOp(I, UndefValue::get(I->getType()));
      }
    }

    Value *Poison = PoisonValue::get(I->getType());
    I->replaceUsesWithIf(Poison, [&](const Use &U) -> bool {
      if (const auto *UI = dyn_cast<Instruction>(U.getUser()))
        return SplitUsers.contains(UI);
      return false;
    });

    if (I->use_empty()) {
      I->eraseFromParent();
      continue;
    }
    IRB.SetInsertPoint(*I->getInsertionPointAfterDef());
    IRB.SetCurrentDebugLocation(I->getDebugLoc());
    auto [Rsrc, Off] = getPtrParts(I);
    Value *Struct = PoisonValue::get(I->getType());
    Struct = IRB.CreateInsertValue(Struct, Rsrc, 0);
    Struct = IRB.CreateInsertValue(Struct, Off, 1);
    copyMetadata(Struct, I);
    Struct->takeName(I);
    I->replaceAllUsesWith(Struct);
    I->eraseFromParent();
  }
}

void SplitPtrStructs::setAlign(CallInst *Intr, Align A, unsigned RsrcArgIdx) {
  LLVMContext &Ctx = Intr->getContext();
  Intr->addParamAttr(RsrcArgIdx, Attribute::getWithAlignment(Ctx, A));
}

void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order,
                                          SyncScope::ID SSID) {
  switch (Order) {
  case AtomicOrdering::Release:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    IRB.CreateFence(AtomicOrdering::Release, SSID);
    break;
  default:
    break;
  }
}

void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order,
                                           SyncScope::ID SSID) {
  switch (Order) {
  case AtomicOrdering::Acquire:
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    IRB.CreateFence(AtomicOrdering::Acquire, SSID);
    break;
  default:
    break;
  }
}

Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
                                         Type *Ty, Align Alignment,
                                         AtomicOrdering Order, bool IsVolatile,
                                         SyncScope::ID SSID) {
  IRB.SetInsertPoint(I);

  auto [Rsrc, Off] = getPtrParts(Ptr);
  SmallVector<Value *, 5> Args;
  if (Arg)
    Args.push_back(Arg);
  Args.push_back(Rsrc);
  Args.push_back(Off);
  insertPreMemOpFence(Order, SSID);
  // soffset is always 0 for these cases, where we always want any offset to be
  // part of bounds checking and we don't know which parts of the GEPs is
  // uniform.
  Args.push_back(IRB.getInt32(0));

  uint32_t Aux = 0;
  bool IsInvariant =
      (isa<LoadInst>(I) && I->getMetadata(LLVMContext::MD_invariant_load));
  bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal);
  // Atomic loads and stores need glc, atomic read-modify-write doesn't.
  bool IsOneWayAtomic =
      !isa<AtomicRMWInst>(I) && Order != AtomicOrdering::NotAtomic;
  if (IsOneWayAtomic)
    Aux |= AMDGPU::CPol::GLC;
  if (IsNonTemporal && !IsInvariant)
    Aux |= AMDGPU::CPol::SLC;
  if (isa<LoadInst>(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10)
    Aux |= (Aux & AMDGPU::CPol::GLC ? AMDGPU::CPol::DLC : 0);
  if (IsVolatile)
    Aux |= AMDGPU::CPol::VOLATILE;
  Args.push_back(IRB.getInt32(Aux));

  Intrinsic::ID IID = Intrinsic::not_intrinsic;
  if (isa<LoadInst>(I))
    IID = Order == AtomicOrdering::NotAtomic
              ?
              Intrinsic::amdgcn_raw_ptr_buffer_load
              : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load;
  else if (isa<StoreInst>(I))
    IID = Intrinsic::amdgcn_raw_ptr_buffer_store;
  else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
    switch (RMW->getOperation()) {
    case AtomicRMWInst::Xchg:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap;
      break;
    case AtomicRMWInst::Add:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add;
      break;
    case AtomicRMWInst::Sub:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub;
      break;
    case AtomicRMWInst::And:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and;
      break;
    case AtomicRMWInst::Or:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or;
      break;
    case AtomicRMWInst::Xor:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor;
      break;
    case AtomicRMWInst::Max:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax;
      break;
    case AtomicRMWInst::Min:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin;
      break;
    case AtomicRMWInst::UMax:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax;
      break;
    case AtomicRMWInst::UMin:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin;
      break;
    case AtomicRMWInst::FAdd:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd;
      break;
    case AtomicRMWInst::FMax:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax;
      break;
    case AtomicRMWInst::FMin:
      IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
      break;
    case AtomicRMWInst::FSub: {
      report_fatal_error("atomic floating point subtraction not supported for "
                         "buffer resources and should've been expanded away");
      break;
    }
    case AtomicRMWInst::Nand:
      report_fatal_error("atomic nand not supported for buffer resources and "
                         "should've been expanded away");
      break;
    case AtomicRMWInst::UIncWrap:
    case AtomicRMWInst::UDecWrap:
      report_fatal_error("wrapping increment/decrement not supported for "
                         "buffer resources and should've been expanded away");
      break;
    case AtomicRMWInst::BAD_BINOP:
      llvm_unreachable("Not sure how we got a bad binop");
    }
  }

  auto *Call = IRB.CreateIntrinsic(IID, Ty, Args);
  copyMetadata(Call, I);
  setAlign(Call, Alignment, Arg ? 1 : 0);
  Call->takeName(I);

  insertPostMemOpFence(Order, SSID);
  // The "no moving p7 directly" rewrites ensure that this load or store won't
  // itself need to be split into parts.
  SplitUsers.insert(I);
  I->replaceAllUsesWith(Call);
  return Call;
}

PtrParts SplitPtrStructs::visitInstruction(Instruction &I) {
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) {
  if (!isSplitFatPtr(LI.getPointerOperandType()))
    return {nullptr, nullptr};
  handleMemoryInst(&LI, nullptr, LI.getPointerOperand(), LI.getType(),
                   LI.getAlign(), LI.getOrdering(), LI.isVolatile(),
                   LI.getSyncScopeID());
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) {
  if (!isSplitFatPtr(SI.getPointerOperandType()))
    return {nullptr, nullptr};
  Value *Arg = SI.getValueOperand();
  handleMemoryInst(&SI, Arg, SI.getPointerOperand(), Arg->getType(),
                   SI.getAlign(), SI.getOrdering(), SI.isVolatile(),
                   SI.getSyncScopeID());
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) {
  if (!isSplitFatPtr(AI.getPointerOperand()->getType()))
    return {nullptr, nullptr};
  Value *Arg = AI.getValOperand();
  handleMemoryInst(&AI, Arg, AI.getPointerOperand(), Arg->getType(),
                   AI.getAlign(), AI.getOrdering(), AI.isVolatile(),
                   AI.getSyncScopeID());
  return {nullptr, nullptr};
}

// Unlike load, store, and RMW, cmpxchg needs special handling to account
// for the boolean argument.
PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) {
  Value *Ptr = AI.getPointerOperand();
  if (!isSplitFatPtr(Ptr->getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&AI);

  Type *Ty = AI.getNewValOperand()->getType();
  AtomicOrdering Order = AI.getMergedOrdering();
  SyncScope::ID SSID = AI.getSyncScopeID();
  bool IsNonTemporal = AI.getMetadata(LLVMContext::MD_nontemporal);

  auto [Rsrc, Off] = getPtrParts(Ptr);
  insertPreMemOpFence(Order, SSID);

  uint32_t Aux = 0;
  if (IsNonTemporal)
    Aux |= AMDGPU::CPol::SLC;
  if (AI.isVolatile())
    Aux |= AMDGPU::CPol::VOLATILE;
  auto *Call =
      IRB.CreateIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Ty,
                          {AI.getNewValOperand(), AI.getCompareOperand(), Rsrc,
                           Off, IRB.getInt32(0), IRB.getInt32(Aux)});
  copyMetadata(Call, &AI);
  setAlign(Call, AI.getAlign(), 2);
  Call->takeName(&AI);
  insertPostMemOpFence(Order, SSID);

  Value *Res = PoisonValue::get(AI.getType());
  Res = IRB.CreateInsertValue(Res, Call, 0);
  if (!AI.isWeak()) {
    Value *Succeeded = IRB.CreateICmpEQ(Call, AI.getCompareOperand());
    Res = IRB.CreateInsertValue(Res, Succeeded, 1);
  }
  SplitUsers.insert(&AI);
  AI.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) {
  using namespace llvm::PatternMatch;
  Value *Ptr = GEP.getPointerOperand();
  if (!isSplitFatPtr(Ptr->getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&GEP);

  auto [Rsrc, Off] = getPtrParts(Ptr);
  const DataLayout &DL = GEP.getDataLayout();
  bool InBounds = GEP.isInBounds();

  // In order to call emitGEPOffset() and thus not have to reimplement it,
  // we need the GEP result to have ptr addrspace(7) type.
  Type *FatPtrTy = IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER);
  if (auto *VT = dyn_cast<VectorType>(Off->getType()))
    FatPtrTy = VectorType::get(FatPtrTy, VT->getElementCount());
  GEP.mutateType(FatPtrTy);
  Value *OffAccum = emitGEPOffset(&IRB, DL, &GEP);
  GEP.mutateType(Ptr->getType());
  if (match(OffAccum, m_Zero())) { // Constant-zero offset
    SplitUsers.insert(&GEP);
    return {Rsrc, Off};
  }

  bool HasNonNegativeOff = false;
  if (auto *CI = dyn_cast<ConstantInt>(OffAccum)) {
    HasNonNegativeOff = !CI->isNegative();
  }
  Value *NewOff;
  if (match(Off, m_Zero())) {
    NewOff = OffAccum;
  } else {
    NewOff = IRB.CreateAdd(Off, OffAccum, "",
                           /*hasNUW=*/InBounds && HasNonNegativeOff,
                           /*hasNSW=*/false);
  }
  copyMetadata(NewOff, &GEP);
  NewOff->takeName(&GEP);
  SplitUsers.insert(&GEP);
  return {Rsrc, NewOff};
}

PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) {
  Value *Ptr = PI.getPointerOperand();
  if (!isSplitFatPtr(Ptr->getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&PI);

  Type *ResTy = PI.getType();
  unsigned Width = ResTy->getScalarSizeInBits();

  auto [Rsrc, Off] = getPtrParts(Ptr);
  const DataLayout &DL = PI.getDataLayout();
  unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER);

  Value *Res;
  if (Width <= BufferOffsetWidth) {
    Res = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
                            PI.getName() + ".off");
  } else {
    Value *RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc");
    Value *Shl = IRB.CreateShl(
        RsrcInt,
        ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)),
        "", Width >= FatPtrWidth, Width > FatPtrWidth);
    Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false,
                                       PI.getName() + ".off");
    Res = IRB.CreateOr(Shl, OffCast);
  }

  copyMetadata(Res, &PI);
  Res->takeName(&PI);
  SplitUsers.insert(&PI);
  PI.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) {
  if (!isSplitFatPtr(IP.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&IP);
  const DataLayout &DL = IP.getDataLayout();
  unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE);
  Value *Int = IP.getOperand(0);
  Type *IntTy = Int->getType();
  Type *RsrcIntTy = IntTy->getWithNewBitWidth(RsrcPtrWidth);
  unsigned Width = IntTy->getScalarSizeInBits();

  auto *RetTy = cast<StructType>(IP.getType());
  Type *RsrcTy = RetTy->getElementType(0);
  Type *OffTy = RetTy->getElementType(1);
  Value *RsrcPart = IRB.CreateLShr(
      Int,
      ConstantExpr::getIntegerValue(IntTy, APInt(Width, BufferOffsetWidth)));
  Value *RsrcInt = IRB.CreateIntCast(RsrcPart, RsrcIntTy, /*isSigned=*/false);
  Value *Rsrc = IRB.CreateIntToPtr(RsrcInt, RsrcTy, IP.getName() + ".rsrc");
  Value *Off =
      IRB.CreateIntCast(Int, OffTy, /*isSigned=*/false, IP.getName() + ".off");

  copyMetadata(Rsrc, &IP);
  SplitUsers.insert(&IP);
  return {Rsrc, Off};
}

PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *In = I.getPointerOperand();
  // No-op casts preserve parts
  if (In->getType() == I.getType()) {
    auto [Rsrc, Off] = getPtrParts(In);
    SplitUsers.insert(&I);
    return {Rsrc, Off};
  }
  if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE)
    report_fatal_error("Only buffer resources (addrspace 8) can be cast to "
                       "buffer fat pointers (addrspace 7)");
  Type *OffTy = cast<StructType>(I.getType())->getElementType(1);
  Value *ZeroOff = Constant::getNullValue(OffTy);
  SplitUsers.insert(&I);
  return {In, ZeroOff};
}

PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) {
  Value *Lhs = Cmp.getOperand(0);
  if (!isSplitFatPtr(Lhs->getType()))
    return {nullptr, nullptr};
  Value *Rhs = Cmp.getOperand(1);
  IRB.SetInsertPoint(&Cmp);
  ICmpInst::Predicate Pred = Cmp.getPredicate();

  assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
         "Pointer comparison is only equal or unequal");
  auto [LhsRsrc, LhsOff] = getPtrParts(Lhs);
  auto [RhsRsrc, RhsOff] = getPtrParts(Rhs);
  Value *RsrcCmp =
      IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc");
  copyMetadata(RsrcCmp, &Cmp);
  Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off");
  copyMetadata(OffCmp, &Cmp);

  Value *Res = nullptr;
  if (Pred == ICmpInst::ICMP_EQ)
    Res = IRB.CreateAnd(RsrcCmp, OffCmp);
  else if (Pred == ICmpInst::ICMP_NE)
    Res = IRB.CreateOr(RsrcCmp, OffCmp);
  copyMetadata(Res, &Cmp);
  Res->takeName(&Cmp);
  SplitUsers.insert(&Cmp);
  Cmp.replaceAllUsesWith(Res);
  return {nullptr, nullptr};
}

PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  auto [Rsrc, Off] = getPtrParts(I.getOperand(0));

  Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) {
  if (!isSplitFatPtr(I.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getVectorOperand();
  Value *Idx = I.getIndexOperand();
  auto [Rsrc, Off] = getPtrParts(Vec);

  Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) {
  // The mutated instructions temporarily don't return vectors, and so
  // we need the generic getType() here to avoid crashes.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);
  Value *Vec = I.getOperand(0);
  Value *Elem = I.getOperand(1);
  Value *Idx = I.getOperand(2);
  auto [VecRsrc, VecOff] = getPtrParts(Vec);
  auto [ElemRsrc, ElemOff] = getPtrParts(Elem);

  Value *RsrcRes =
      IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) {
  // Cast is needed for the same reason as insertelement's.
  if (!isSplitFatPtr(cast<Instruction>(I).getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&I);

  Value *V1 = I.getOperand(0);
  Value *V2 = I.getOperand(1);
  ArrayRef<int> Mask = I.getShuffleMask();
  auto [V1Rsrc, V1Off] = getPtrParts(V1);
  auto [V2Rsrc, V2Off] = getPtrParts(V2);

  Value *RsrcRes =
      IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc");
  copyMetadata(RsrcRes, &I);
  Value *OffRes =
      IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off");
  copyMetadata(OffRes, &I);
  SplitUsers.insert(&I);
  return {RsrcRes, OffRes};
}

PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) {
  if (!isSplitFatPtr(PHI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(*PHI.getInsertionPointAfterDef());
  // Phi nodes will be handled in post-processing after we've visited every
  // instruction. However, instead of just returning {nullptr, nullptr},
  // we explicitly create the temporary extractvalue operations that are our
  // temporary results so that they end up at the beginning of the block with
  // the PHIs.
  Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc");
  Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off");
  Conditionals.push_back(&PHI);
  SplitUsers.insert(&PHI);
  return {TmpRsrc, TmpOff};
}

PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) {
  if (!isSplitFatPtr(SI.getType()))
    return {nullptr, nullptr};
  IRB.SetInsertPoint(&SI);

  Value *Cond = SI.getCondition();
  Value *True = SI.getTrueValue();
  Value *False = SI.getFalseValue();
  auto [TrueRsrc, TrueOff] = getPtrParts(True);
  auto [FalseRsrc, FalseOff] = getPtrParts(False);

  Value *RsrcRes =
      IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI);
  copyMetadata(RsrcRes, &SI);
  Conditionals.push_back(&SI);
  Value *OffRes =
      IRB.CreateSelect(Cond, TrueOff, FalseOff, SI.getName() + ".off", &SI);
  copyMetadata(OffRes, &SI);
  SplitUsers.insert(&SI);
  return {RsrcRes, OffRes};
}

/// Returns true if this intrinsic needs to be removed when it is
/// applied to `ptr addrspace(7)` values. Calls to these intrinsics are
/// rewritten into calls to versions of that intrinsic on the resource
/// descriptor.
static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  default:
    return false;
  case Intrinsic::ptrmask:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
    return true;
  }
}

PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
  Intrinsic::ID IID = I.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ptrmask: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    Value *Mask = I.getArgOperand(1);
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    if (Mask->getType() != Off->getType())
      report_fatal_error("offset width is not equal to index width of fat "
                         "pointer (data layout not set up correctly?)");
    Value *OffRes = IRB.CreateAnd(Off, Mask, I.getName() + ".off");
    copyMetadata(OffRes, &I);
    SplitUsers.insert(&I);
    return {Rsrc, OffRes};
  }
  // Pointer annotation intrinsics that, given their object-wide nature,
  // operate on the resource part.
  case Intrinsic::invariant_start: {
    Value *Ptr = I.getArgOperand(1);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE);
    auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::invariant_end: {
    Value *RealPtr = I.getArgOperand(2);
    if (!isSplitFatPtr(RealPtr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    Value *RealRsrc = getPtrParts(RealPtr).first;
    Value *InvPtr = I.getArgOperand(0);
    Value *Size = I.getArgOperand(1);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()},
                                         {InvPtr, Size, RealRsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    I.replaceAllUsesWith(NewRsrc);
    return {nullptr, nullptr};
  }
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group: {
    Value *Ptr = I.getArgOperand(0);
    if (!isSplitFatPtr(Ptr->getType()))
      return {nullptr, nullptr};
    IRB.SetInsertPoint(&I);
    auto [Rsrc, Off] = getPtrParts(Ptr);
    Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc});
    copyMetadata(NewRsrc, &I);
    NewRsrc->takeName(&I);
    SplitUsers.insert(&I);
    return {NewRsrc, Off};
  }
  }
  return {nullptr, nullptr};
}

void SplitPtrStructs::processFunction(Function &F) {
  ST = &TM->getSubtarget<GCNSubtarget>(F);
  SmallVector<Instruction *> Originals;
  LLVM_DEBUG(dbgs() << "Splitting pointer structs in function: " << F.getName()
                    << "\n");
  for (Instruction &I : instructions(F))
    Originals.push_back(&I);
  for (Instruction *I : Originals) {
    auto [Rsrc, Off] = visit(I);
    assert(((Rsrc && Off) || (!Rsrc && !Off)) &&
           "Can't have a resource but no offset");
    if (Rsrc)
      RsrcParts[I] = Rsrc;
    if (Off)
      OffParts[I] = Off;
  }
  processConditionals();
  killAndReplaceSplitInstructions(Originals);

  // Clean up after ourselves to save on memory.
  RsrcParts.clear();
  OffParts.clear();
  SplitUsers.clear();
  Conditionals.clear();
  ConditionalTemps.clear();
}

namespace {
class AMDGPULowerBufferFatPointers : public ModulePass {
public:
  static char ID;

  AMDGPULowerBufferFatPointers() : ModulePass(ID) {
    initializeAMDGPULowerBufferFatPointersPass(
        *PassRegistry::getPassRegistry());
  }

  bool run(Module &M, const TargetMachine &TM);
  bool runOnModule(Module &M) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // namespace

/// Returns true if there are values that have a buffer fat pointer in them,
/// which means we'll need to perform rewrites on this function. As a side
/// effect, this will populate the type remapping cache.
static bool containsBufferFatPointers(const Function &F,
                                      BufferFatPtrToStructTypeMap *TypeMap) {
  bool HasFatPointers = false;
  for (const BasicBlock &BB : F)
    for (const Instruction &I : BB)
      HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType()));
  return HasFatPointers;
}

static bool hasFatPointerInterface(const Function &F,
                                   BufferFatPtrToStructTypeMap *TypeMap) {
  Type *Ty = F.getFunctionType();
  return Ty != TypeMap->remapType(Ty);
}

/// Move the body of `OldF` into a new function, returning it.
static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
                                          ValueToValueMapTy &CloneMap) {
  bool IsIntrinsic = OldF->isIntrinsic();
  Function *NewF =
      Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
  NewF->copyAttributesFrom(OldF);
  NewF->copyMetadata(OldF, 0);
  NewF->takeName(OldF);
  NewF->updateAfterNameChange();
  NewF->setDLLStorageClass(OldF->getDLLStorageClass());
  OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF);

  while (!OldF->empty()) {
    BasicBlock *BB = &OldF->front();
    BB->removeFromParent();
    BB->insertInto(NewF);
    CloneMap[BB] = BB;
    for (Instruction &I : *BB) {
      CloneMap[&I] = &I;
    }
  }

  AttributeMask PtrOnlyAttrs;
  for (auto K :
       {Attribute::Dereferenceable, Attribute::DereferenceableOrNull,
        Attribute::NoAlias, Attribute::NoCapture, Attribute::NoFree,
        Attribute::NonNull, Attribute::NullPointerIsValid, Attribute::ReadNone,
        Attribute::ReadOnly, Attribute::WriteOnly}) {
    PtrOnlyAttrs.addAttribute(K);
  }
  SmallVector<AttributeSet> ArgAttrs;
  AttributeList OldAttrs = OldF->getAttributes();

  for (auto [I, OldArg, NewArg] : enumerate(OldF->args(), NewF->args())) {
    CloneMap[&NewArg] = &OldArg;
    NewArg.takeName(&OldArg);
    Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType();
    // Temporarily mutate type of `NewArg` to allow RAUW to work.
    NewArg.mutateType(OldArgTy);
    OldArg.replaceAllUsesWith(&NewArg);
    NewArg.mutateType(NewArgTy);

    AttributeSet ArgAttr = OldAttrs.getParamAttrs(I);
    // Intrinsics get their attributes fixed later.
    if (OldArgTy != NewArgTy && !IsIntrinsic)
      ArgAttr = ArgAttr.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
    ArgAttrs.push_back(ArgAttr);
  }
  AttributeSet RetAttrs = OldAttrs.getRetAttrs();
  if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic)
    RetAttrs = RetAttrs.removeAttributes(NewF->getContext(), PtrOnlyAttrs);
  NewF->setAttributes(AttributeList::get(
      NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs));
  return NewF;
}

static void makeCloneInPlaceMap(Function *F, ValueToValueMapTy &CloneMap) {
  for (Argument &A : F->args())
    CloneMap[&A] = &A;
  for (BasicBlock &BB : *F) {
    CloneMap[&BB] = &BB;
    for (Instruction &I : BB)
      CloneMap[&I] = &I;
  }
}

bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
  bool Changed = false;
  const DataLayout &DL = M.getDataLayout();
  // Record the functions which need to be remapped.
  // The second element of the pair indicates whether the function has to have
  // its arguments or return types adjusted.
  SmallVector<std::pair<Function *, bool>> NeedsRemap;

  BufferFatPtrToStructTypeMap StructTM(DL);
  BufferFatPtrToIntTypeMap IntTM(DL);
  for (const GlobalVariable &GV : M.globals()) {
    if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER)
      report_fatal_error("Global variables with a buffer fat pointer address "
                         "space (7) are not supported");
    Type *VT = GV.getValueType();
    if (VT != StructTM.remapType(VT))
      report_fatal_error("Global variables that contain buffer fat pointers "
                         "(address space 7 pointers) are unsupported. Use "
                         "buffer resource pointers (address space 8) instead.");
  }

  {
    // Collect all constant exprs and aggregates referenced by any function.
    SmallVector<Constant *, 8> Worklist;
    for (Function &F : M.functions())
      for (Instruction &I : instructions(F))
        for (Value *Op : I.operands())
          if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
            Worklist.push_back(cast<Constant>(Op));

    // Recursively look for any referenced buffer pointer constants.
    SmallPtrSet<Constant *, 8> Visited;
    SetVector<Constant *> BufferFatPtrConsts;
    while (!Worklist.empty()) {
      Constant *C = Worklist.pop_back_val();
      if (!Visited.insert(C).second)
        continue;
      if (isBufferFatPtrOrVector(C->getType()))
        BufferFatPtrConsts.insert(C);
      for (Value *Op : C->operands())
        if (isa<ConstantExpr>(Op) || isa<ConstantAggregate>(Op))
          Worklist.push_back(cast<Constant>(Op));
    }

    // Expand all constant expressions using fat buffer pointers to
    // instructions.
    Changed |= convertUsersOfConstantsToInstructions(
        BufferFatPtrConsts.getArrayRef(), /*RestrictToFunc=*/nullptr,
        /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
  }

  StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
  for (Function &F : M.functions()) {
    bool InterfaceChange = hasFatPointerInterface(F, &StructTM);
    bool BodyChanges = containsBufferFatPointers(F, &StructTM);
    Changed |= MemOpsRewrite.processFunction(F);
    if (InterfaceChange || BodyChanges)
      NeedsRemap.push_back(std::make_pair(&F, InterfaceChange));
  }
  if (NeedsRemap.empty())
    return Changed;

  SmallVector<Function *> NeedsPostProcess;
  SmallVector<Function *> Intrinsics;
  // Keep one big map so as to memoize constants across functions.
  ValueToValueMapTy CloneMap;
  FatPtrConstMaterializer Materializer(&StructTM, CloneMap);

  ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer);
  for (auto [F, InterfaceChange] : NeedsRemap) {
    Function *NewF = F;
    if (InterfaceChange)
      NewF = moveFunctionAdaptingType(
          F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())),
          CloneMap);
    else
      makeCloneInPlaceMap(F, CloneMap);
    LowerInFuncs.remapFunction(*NewF);
    if (NewF->isIntrinsic())
      Intrinsics.push_back(NewF);
    else
      NeedsPostProcess.push_back(NewF);
    if (InterfaceChange) {
      F->replaceAllUsesWith(NewF);
      F->eraseFromParent();
    }
    Changed = true;
  }
  StructTM.clear();
  IntTM.clear();
  CloneMap.clear();

  SplitPtrStructs Splitter(M.getContext(), &TM);
  for (Function *F : NeedsPostProcess)
    Splitter.processFunction(*F);
  for (Function *F : Intrinsics) {
    if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
      F->eraseFromParent();
    } else {
      std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
      if (NewF)
        F->replaceAllUsesWith(*NewF);
    }
  }
  return Changed;
}

bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) {
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return run(M, TM);
}

char AMDGPULowerBufferFatPointers::ID = 0;

char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID;

void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
}

#define PASS_DESC "Lower buffer fat pointer operations to buffer resources"
INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC,
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false,
                    false)
#undef PASS_DESC

ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() {
  return new AMDGPULowerBufferFatPointers();
}

PreservedAnalyses
AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) {
  return AMDGPULowerBufferFatPointers().run(M, TM) ? PreservedAnalyses::none()
                                                   : PreservedAnalyses::all();
}