//=== AArch64PostLegalizerCombiner.cpp --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// Post-legalization combines on generic MachineInstrs.
///
/// The combines here must preserve instruction legality.
///
/// Lowering combines (e.g. pseudo matching) should be handled by
/// AArch64PostLegalizerLowering.
///
/// Combines which don't rely on instruction legality should go in the
/// AArch64PreLegalizerCombiner.
///
//===----------------------------------------------------------------------===//

#include "AArch64TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define GET_GICOMBINER_DEPS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "aarch64-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

#define GET_GICOMBINER_TYPES
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// This combine tries to do what performExtractVectorEltCombine does in SDAG.
/// Rewrite for pairwise fadd pattern
///   (s32 (g_extract_vector_elt
///           (g_fadd (vXs32 Other)
///                   (g_vector_shuffle (vXs32 Other) undef <1,X,...>)) 0))
/// ->
///   (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0)
///                (g_extract_vector_elt (vXs32 Other) 1)))
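///
/// For example, a matching input could look like (illustrative MIR):
///   %shuf:_(<2 x s32>) = G_SHUFFLE_VECTOR %v(<2 x s32>), %undef,
///                            shufflemask(1, undef)
///   %sum:_(<2 x s32>) = G_FADD %v, %shuf
///   %zero:_(s64) = G_CONSTANT i64 0
///   %res:_(s32) = G_EXTRACT_VECTOR_ELT %sum(<2 x s32>), %zero(s64)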
bool matchExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  Register Src1 = MI.getOperand(1).getReg();
  Register Src2 = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  auto Cst = getIConstantVRegValWithLookThrough(Src2, MRI);
  if (!Cst || Cst->Value != 0)
    return false;
  // SDAG also checks for FullFP16, but this looks to be beneficial anyway.

  // Now check for an fadd operation. TODO: expand this for integer add?
  auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI);
  if (!FAddMI)
    return false;

  // If we add support for integer add, must restrict these types to just s64.
  unsigned DstSize = DstTy.getSizeInBits();
  if (DstSize != 16 && DstSize != 32 && DstSize != 64)
    return false;

  Register Src1Op1 = FAddMI->getOperand(1).getReg();
  Register Src1Op2 = FAddMI->getOperand(2).getReg();
  MachineInstr *Shuffle =
      getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI);
  MachineInstr *Other = MRI.getVRegDef(Src1Op1);
  if (!Shuffle) {
    Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI);
    Other = MRI.getVRegDef(Src1Op2);
  }

  // We're looking for a shuffle that moves the second element to index 0.
  if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 &&
      Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) {
    std::get<0>(MatchInfo) = TargetOpcode::G_FADD;
    std::get<1>(MatchInfo) = DstTy;
    std::get<2>(MatchInfo) = Other->getOperand(0).getReg();
    return true;
  }
  return false;
}

void applyExtractVecEltPairwiseAdd(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::tuple<unsigned, LLT, Register> &MatchInfo) {
  unsigned Opc = std::get<0>(MatchInfo);
  assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!");
  // We want to generate two extracts of elements 0 and 1, and add them.
  LLT Ty = std::get<1>(MatchInfo);
  Register Src = std::get<2>(MatchInfo);
  LLT s64 = LLT::scalar(64);
  B.setInstrAndDebugLoc(MI);
  auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0));
  auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1));
  B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1});
  MI.eraseFromParent();
}

bool isSignExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  unsigned Opc = MRI.getVRegDef(R)->getOpcode();
  return Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG;
}

bool isZeroExtended(Register R, MachineRegisterInfo &MRI) {
  // TODO: check if extended build vector as well.
  return MRI.getVRegDef(R)->getOpcode() == TargetOpcode::G_ZEXT;
}

bool matchAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  assert(MI.getOpcode() == TargetOpcode::G_MUL);
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  Register Dst = MI.getOperand(0).getReg();
  const LLT Ty = MRI.getType(LHS);

  // The below optimizations require a constant RHS.
  auto Const = getIConstantVRegValWithLookThrough(RHS, MRI);
  if (!Const)
    return false;

  APInt ConstValue = Const->Value.sext(Ty.getSizeInBits());
  // The following code is ported from AArch64ISelLowering.
  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  // More aggressively, some multiplications N0 * C can be lowered to
  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  // e.g. 6=3*2=(2+1)*2.
  // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
  // which equals to (1+2)*16-(1+2).

  // TrailingZeroes is used to test if the mul can be lowered to
  // shift+add+shift.
  unsigned TrailingZeroes = ConstValue.countr_zero();
  if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smul or umul.
    if (MRI.hasOneNonDBGUse(LHS) &&
        (isSignExtended(LHS, MRI) || isZeroExtended(LHS, MRI)))
      return false;
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into madd or msub.
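    // For example, with a single G_ADD user the selector can usually fold the
    // multiply directly:
    //   %m:_(s64) = G_MUL %x, %c
    //   %r:_(s64) = G_ADD %m, %a   ; selectable as MADD
    // so expanding %m into shift+add+shift here would block that fold.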
    if (MRI.hasOneNonDBGUse(Dst)) {
      MachineInstr &UseMI = *MRI.use_instr_begin(Dst);
      unsigned UseOpc = UseMI.getOpcode();
      if (UseOpc == TargetOpcode::G_ADD || UseOpc == TargetOpcode::G_PTR_ADD ||
          UseOpc == TargetOpcode::G_SUB)
        return false;
    }
  }
  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  // and shift+add+shift.
  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

  unsigned ShiftAmt, AddSubOpc;
  // Is the shifted value the LHS operand of the add/sub?
  bool ShiftValUseIsLHS = true;
  // Do we need to negate the result?
  bool NegateResult = false;

  if (ConstValue.isNonNegative()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    APInt SCVMinus1 = ShiftedConstValue - 1;
    APInt CVPlus1 = ConstValue + 1;
    if (SCVMinus1.isPowerOf2()) {
      ShiftAmt = SCVMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
    } else if (CVPlus1.isPowerOf2()) {
      ShiftAmt = CVPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
    } else
      return false;
  } else {
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
    APInt CVNegPlus1 = -ConstValue + 1;
    APInt CVNegMinus1 = -ConstValue - 1;
    if (CVNegPlus1.isPowerOf2()) {
      ShiftAmt = CVNegPlus1.logBase2();
      AddSubOpc = TargetOpcode::G_SUB;
      ShiftValUseIsLHS = false;
    } else if (CVNegMinus1.isPowerOf2()) {
      ShiftAmt = CVNegMinus1.logBase2();
      AddSubOpc = TargetOpcode::G_ADD;
      NegateResult = true;
    } else
      return false;
  }

  if (NegateResult && TrailingZeroes)
    return false;

  ApplyFn = [=](MachineIRBuilder &B, Register DstReg) {
    auto Shift = B.buildConstant(LLT::scalar(64), ShiftAmt);
    auto ShiftedVal = B.buildShl(Ty, LHS, Shift);

    Register AddSubLHS = ShiftValUseIsLHS ? ShiftedVal.getReg(0) : LHS;
    Register AddSubRHS = ShiftValUseIsLHS ? LHS : ShiftedVal.getReg(0);
    auto Res = B.buildInstr(AddSubOpc, {Ty}, {AddSubLHS, AddSubRHS});
    assert(!(NegateResult && TrailingZeroes) &&
           "NegateResult and TrailingZeroes cannot both be true for now.");
    // Negate the result.
    if (NegateResult) {
      B.buildSub(DstReg, B.buildConstant(Ty, 0), Res);
      return;
    }
    // Shift the result.
    if (TrailingZeroes) {
      B.buildShl(DstReg, Res, B.buildConstant(LLT::scalar(64), TrailingZeroes));
      return;
    }
    B.buildCopy(DstReg, Res.getReg(0));
  };
  return true;
}

void applyAArch64MulConstCombine(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    std::function<void(MachineIRBuilder &B, Register DstReg)> &ApplyFn) {
  B.setInstrAndDebugLoc(MI);
  ApplyFn(B, MI.getOperand(0).getReg());
  MI.eraseFromParent();
}

/// Try to fold a G_MERGE_VALUES of 2 s32 sources, where the second source
/// is a zero, into a G_ZEXT of the first.
bool matchFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI) {
  auto &Merge = cast<GMerge>(MI);
  LLT SrcTy = MRI.getType(Merge.getSourceReg(0));
  if (SrcTy != LLT::scalar(32) || Merge.getNumSources() != 2)
    return false;
  return mi_match(Merge.getSourceReg(1), MRI, m_SpecificICst(0));
}

void applyFoldMergeToZext(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B, GISelChangeObserver &Observer) {
  // Mutate %d(s64) = G_MERGE_VALUES %a(s32), 0(s32)
  //  ->
  //        %d(s64) = G_ZEXT %a(s32)
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  MI.removeOperand(2);
  Observer.changedInstr(MI);
}

/// \returns True if a G_ANYEXT instruction \p MI should be mutated to a G_ZEXT
/// instruction.
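///
/// This is safe because a scalar G_ICMP/G_FCMP produces 0 or 1, so the high
/// bits are already known to be zero and any-extension is equivalent to
/// zero-extension.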
bool matchMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI) {
  // If this is coming from a scalar compare then we can use a G_ZEXT instead
  // of a G_ANYEXT:
  //
  // %cmp:_(s32) = G_[I|F]CMP ... <-- produces 0/1.
  // %ext:_(s64) = G_ANYEXT %cmp(s32)
  //
  // By doing this, we can leverage more KnownBits combines.
  assert(MI.getOpcode() == TargetOpcode::G_ANYEXT);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  return MRI.getType(Dst).isScalar() &&
         mi_match(Src, MRI,
                  m_any_of(m_GICmp(m_Pred(), m_Reg(), m_Reg()),
                           m_GFCmp(m_Pred(), m_Reg(), m_Reg())));
}

void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI,
                             MachineIRBuilder &B,
                             GISelChangeObserver &Observer) {
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(TargetOpcode::G_ZEXT));
  Observer.changedInstr(MI);
}

/// Match a 128b store of zero and split it into two 64 bit stores, for
/// size/performance reasons.
bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) {
  GStore &Store = cast<GStore>(MI);
  if (!Store.isSimple())
    return false;
  LLT ValTy = MRI.getType(Store.getValueReg());
  if (ValTy.isScalableVector())
    return false;
  if (!ValTy.isVector() || ValTy.getSizeInBits() != 128)
    return false;
  if (Store.getMemSizeInBits() != ValTy.getSizeInBits())
    return false; // Don't split truncating stores.
  if (!MRI.hasOneNonDBGUse(Store.getValueReg()))
    return false;
  auto MaybeCst = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(Store.getValueReg()), MRI);
  return MaybeCst && MaybeCst->isZero();
}

void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &B,
                            GISelChangeObserver &Observer) {
  B.setInstrAndDebugLoc(MI);
  GStore &Store = cast<GStore>(MI);
  assert(MRI.getType(Store.getValueReg()).isVector() &&
         "Expected a vector store value");
  LLT NewTy = LLT::scalar(64);
  Register PtrReg = Store.getPointerReg();
  auto Zero = B.buildConstant(NewTy, 0);
  auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg,
                               B.buildConstant(LLT::scalar(64), 8));
  auto &MF = *MI.getMF();
  auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy);
  auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy);
  B.buildStore(Zero, PtrReg, *LowMMO);
  B.buildStore(Zero, HighPtr, *HighMMO);
  Store.eraseFromParent();
}

bool matchOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if (!DstTy.isVector())
    return false;

  Register AO1, AO2, BVO1, BVO2;
  if (!mi_match(MI, MRI,
                m_GOr(m_GAnd(m_Reg(AO1), m_Reg(BVO1)),
                      m_GAnd(m_Reg(AO2), m_Reg(BVO2)))))
    return false;

  auto *BV1 = getOpcodeDef<GBuildVector>(BVO1, MRI);
  auto *BV2 = getOpcodeDef<GBuildVector>(BVO2, MRI);
  if (!BV1 || !BV2)
    return false;

  for (int I = 0, E = DstTy.getNumElements(); I < E; I++) {
    auto ValAndVReg1 =
        getIConstantVRegValWithLookThrough(BV1->getSourceReg(I), MRI);
    auto ValAndVReg2 =
        getIConstantVRegValWithLookThrough(BV2->getSourceReg(I), MRI);
    if (!ValAndVReg1 || !ValAndVReg2 ||
        ValAndVReg1->Value != ~ValAndVReg2->Value)
      return false;
  }

  MatchInfo = {AO1, AO2, BVO1};
  return true;
}

void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
                  MachineIRBuilder &B,
                  std::tuple<Register, Register, Register> &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  B.buildInstr(
      AArch64::G_BSP, {MI.getOperand(0).getReg()},
      {std::get<2>(MatchInfo), std::get<0>(MatchInfo), std::get<1>(MatchInfo)});
  MI.eraseFromParent();
}

// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         Register &SrcReg) {
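  // For example, in the <4 x s32> case (HalfSize == 16) the matched pattern is
  //   %s:_(<4 x s32>) = G_LSHR %x, splat(15)
  //   %a:_(<4 x s32>) = G_AND %s, splat(0x10001)
  //   %m:_(<4 x s32>) = G_MUL %a, splat(0xffff)
  // which sets each 16-bit half of a lane to all-ones exactly when that half
  // of %x is negative, i.e. a CMLT #0 on %x viewed as <8 x s16>.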
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  if (DstTy != LLT::fixed_vector(2, 64) && DstTy != LLT::fixed_vector(2, 32) &&
      DstTy != LLT::fixed_vector(4, 32) && DstTy != LLT::fixed_vector(4, 16) &&
      DstTy != LLT::fixed_vector(8, 16))
    return false;

  auto AndMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  if (AndMI->getOpcode() != TargetOpcode::G_AND)
    return false;
  auto LShrMI = getDefIgnoringCopies(AndMI->getOperand(1).getReg(), MRI);
  if (LShrMI->getOpcode() != TargetOpcode::G_LSHR)
    return false;

  // Check the constant splat values
  auto V1 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(MI.getOperand(2).getReg()), MRI);
  auto V2 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(AndMI->getOperand(2).getReg()), MRI);
  auto V3 = isConstantOrConstantSplatVector(
      *MRI.getVRegDef(LShrMI->getOperand(2).getReg()), MRI);
  if (!V1.has_value() || !V2.has_value() || !V3.has_value())
    return false;
  unsigned HalfSize = DstTy.getScalarSizeInBits() / 2;
  if (!V1.value().isMask(HalfSize) || V2.value() != (1ULL | 1ULL << HalfSize) ||
      V3 != (HalfSize - 1))
    return false;

  SrcReg = LShrMI->getOperand(1).getReg();

  return true;
}

void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, Register &SrcReg) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT HalfTy =
      DstTy.changeElementCount(DstTy.getElementCount().multiplyCoefficientBy(2))
          .changeElementSize(DstTy.getScalarSizeInBits() / 2);

  Register ZeroVec = B.buildConstant(HalfTy, 0).getReg(0);
  Register CastReg =
      B.buildInstr(TargetOpcode::G_BITCAST, {HalfTy}, {SrcReg}).getReg(0);
  Register CMLTReg =
      B.buildICmp(CmpInst::Predicate::ICMP_SLT, HalfTy, CastReg, ZeroVec)
          .getReg(0);

  B.buildInstr(TargetOpcode::G_BITCAST, {DstReg}, {CMLTReg}).getReg(0);
  MI.eraseFromParent();
}
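/// Combiner implementation for the post-legalization combines: the rules
/// imported from AArch64GenPostLegalizeGICombiner.inc plus the hand-written
/// match/apply helpers above, dispatched from the generated tryCombineAll().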
class AArch64PostLegalizerCombinerImpl : public Combiner {
protected:
  // TODO: Make CombinerHelper methods const.
  mutable CombinerHelper Helper;
  const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig;
  const AArch64Subtarget &STI;

public:
  AArch64PostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
      const AArch64Subtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AArch64PostLegalizerCombiner"; }

  bool tryCombineAll(MachineInstr &I) const override;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
};

#define GET_GICOMBINER_IMPL
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_IMPL

AArch64PostLegalizerCombinerImpl::AArch64PostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AArch64PostLegalizerCombinerImplRuleConfig &RuleConfig,
    const AArch64Subtarget &STI, MachineDominatorTree *MDT,
    const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
      RuleConfig(RuleConfig), STI(STI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AArch64GenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

class AArch64PostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AArch64PostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AArch64PostLegalizerCombinerImplRuleConfig RuleConfig;

  struct StoreInfo {
    GStore *St = nullptr;
    // The G_PTR_ADD that's used by the store. We keep this to cache the
    // MachineInstr def.
    GPtrAdd *Ptr = nullptr;
    // The signed offset to the Ptr instruction.
    int64_t Offset = 0;
    LLT StoredType;
  };
  bool tryOptimizeConsecStores(SmallVectorImpl<StoreInfo> &Stores,
                               CSEMIRBuilder &MIB);

  bool optimizeConsecutiveMemOpAddressing(MachineFunction &MF,
                                          CSEMIRBuilder &MIB);
};
} // end anonymous namespace

void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.addRequired<GISelCSEAnalysisWrapperPass>();
    AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  assert(MF.getProperties().hasProperty(
             MachineFunctionProperties::Property::Legalized) &&
         "Expected a legalized function?");
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  const auto *LI = ST.getLegalizerInfo();

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
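  // Note: the dominator tree is only requested above -O0 (see
  // getAnalysisUsage), so MDT may legitimately be null here.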
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr
                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());

  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                     F.hasMinSize());
  AArch64PostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo,
                                        RuleConfig, ST, MDT, LI);
  bool Changed = Impl.combineMachineInstrs();

  auto MIB = CSEMIRBuilder(MF);
  MIB.setCSEInfo(CSEInfo);
  Changed |= optimizeConsecutiveMemOpAddressing(MF, MIB);
  return Changed;
}

bool AArch64PostLegalizerCombiner::tryOptimizeConsecStores(
    SmallVectorImpl<StoreInfo> &Stores, CSEMIRBuilder &MIB) {
  if (Stores.size() <= 2)
    return false;

  // Profitability checks:
  int64_t BaseOffset = Stores[0].Offset;
  unsigned NumPairsExpected = Stores.size() / 2;
  unsigned TotalInstsExpected = NumPairsExpected + (Stores.size() % 2);
  // Size savings will depend on whether we can fold the offset, as an
  // immediate of an ADD.
  auto &TLI = *MIB.getMF().getSubtarget().getTargetLowering();
  if (!TLI.isLegalAddImmediate(BaseOffset))
    TotalInstsExpected++;
  int SavingsExpected = Stores.size() - TotalInstsExpected;
  if (SavingsExpected <= 0)
    return false;

  auto &MRI = MIB.getMF().getRegInfo();

  // We have a series of consecutive stores. Factor out the common base
  // pointer and rewrite the offsets.
  Register NewBase = Stores[0].Ptr->getReg(0);
  for (auto &SInfo : Stores) {
    // Compute a new pointer with the new base ptr and adjusted offset.
    MIB.setInstrAndDebugLoc(*SInfo.St);
    auto NewOff = MIB.buildConstant(LLT::scalar(64), SInfo.Offset - BaseOffset);
    auto NewPtr = MIB.buildPtrAdd(MRI.getType(SInfo.St->getPointerReg()),
                                  NewBase, NewOff);
    if (MIB.getObserver())
      MIB.getObserver()->changingInstr(*SInfo.St);
    SInfo.St->getOperand(1).setReg(NewPtr.getReg(0));
    if (MIB.getObserver())
      MIB.getObserver()->changedInstr(*SInfo.St);
  }
  LLVM_DEBUG(dbgs() << "Split a series of " << Stores.size()
                    << " stores into a base pointer and offsets.\n");
  return true;
}

static cl::opt<bool>
    EnableConsecutiveMemOpOpt("aarch64-postlegalizer-consecutive-memops",
                              cl::init(true), cl::Hidden,
                              cl::desc("Enable consecutive memop optimization "
                                       "in AArch64PostLegalizerCombiner"));

bool AArch64PostLegalizerCombiner::optimizeConsecutiveMemOpAddressing(
    MachineFunction &MF, CSEMIRBuilder &MIB) {
  // This combine needs to run after all reassociations/folds on pointer
  // addressing have been done, specifically those that combine two G_PTR_ADDs
  // with constant offsets into a single G_PTR_ADD with a combined offset.
  // The goal of this optimization is to undo that combine in the case where
  // doing so has prevented the formation of pair stores due to illegal
  // addressing modes of STP.
  // The reason that we do it here is because it's much easier to undo the
  // transformation of a series of consecutive mem ops, than it is to detect
  // when doing it would be a bad idea looking at a single G_PTR_ADD in the
  // reassociation/ptradd_immed_chain combine.
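  //
  // When the sequence is long enough to be profitable, tryOptimizeConsecStores
  // rewrites the stores below to address off the first collected G_PTR_ADD
  // (%p1 in the example) using small offsets (0, 16, 32, ...) that STP can
  // encode.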
  //
  // An example:
  //   G_STORE %11:_(<2 x s64>), %base:_(p0) :: (store (<2 x s64>), align 1)
  //   %off1:_(s64) = G_CONSTANT i64 4128
  //   %p1:_(p0) = G_PTR_ADD %base:_, %off1:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p1:_(p0) :: (store (<2 x s64>), align 1)
  //   %off2:_(s64) = G_CONSTANT i64 4144
  //   %p2:_(p0) = G_PTR_ADD %base:_, %off2:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p2:_(p0) :: (store (<2 x s64>), align 1)
  //   %off3:_(s64) = G_CONSTANT i64 4160
  //   %p3:_(p0) = G_PTR_ADD %base:_, %off3:_(s64)
  //   G_STORE %11:_(<2 x s64>), %p3:_(p0) :: (store (<2 x s64>), align 1)
  bool Changed = false;
  auto &MRI = MF.getRegInfo();

  if (!EnableConsecutiveMemOpOpt)
    return Changed;

  SmallVector<StoreInfo, 8> Stores;
  // If we see a load, then we keep track of any values defined by it.
  // In the following example, STP formation will fail anyway because
  // the latter store is using a load result that appears after the
  // prior store. In this situation if we factor out the offset then
  // we increase code size for no benefit.
  //   G_STORE %v1:_(s64), %base:_(p0) :: (store (s64))
  //   %v2:_(s64) = G_LOAD %ldptr:_(p0) :: (load (s64))
  //   G_STORE %v2:_(s64), %base:_(p0) :: (store (s64))
  SmallVector<Register> LoadValsSinceLastStore;

  auto storeIsValid = [&](StoreInfo &Last, StoreInfo New) {
    // Check if this store is consecutive to the last one.
    if (Last.Ptr->getBaseReg() != New.Ptr->getBaseReg() ||
        (Last.Offset +
             static_cast<int64_t>(Last.StoredType.getSizeInBytes()) !=
         New.Offset) ||
        Last.StoredType != New.StoredType)
      return false;

    // Check if this store is using a load result that appears after the
    // last store. If so, bail out.
    if (any_of(LoadValsSinceLastStore, [&](Register LoadVal) {
          return New.St->getValueReg() == LoadVal;
        }))
      return false;

    // Check if the current offset would be too large for STP.
    // If not, then STP formation should be able to handle it, so we don't
    // need to do anything.
    int64_t MaxLegalOffset;
    switch (New.StoredType.getSizeInBits()) {
    case 32:
      MaxLegalOffset = 252;
      break;
    case 64:
      MaxLegalOffset = 504;
      break;
    case 128:
      MaxLegalOffset = 1008;
      break;
    default:
      llvm_unreachable("Unexpected stored type size");
    }
    if (New.Offset < MaxLegalOffset)
      return false;

    // If factoring it out still wouldn't help then don't bother.
    return New.Offset - Stores[0].Offset <= MaxLegalOffset;
  };
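  // For example, a 128-bit STP encodes signed offsets of -1024..1008 in steps
  // of 16, so a <2 x s64> store at offset 4144 only stays in the sequence if
  // rebasing it on the sequence's first store brings it within 1008 bytes.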
  auto resetState = [&]() {
    Stores.clear();
    LoadValsSinceLastStore.clear();
  };

  for (auto &MBB : MF) {
    // We're looking inside a single BB at a time since the memset pattern
    // should only be in a single block.
    resetState();
    for (auto &MI : MBB) {
      // Skip for scalable vectors
      if (auto *LdSt = dyn_cast<GLoadStore>(&MI);
          LdSt && MRI.getType(LdSt->getOperand(0).getReg()).isScalableVector())
        continue;

      if (auto *St = dyn_cast<GStore>(&MI)) {
        Register PtrBaseReg;
        APInt Offset;
        LLT StoredValTy = MRI.getType(St->getValueReg());
        unsigned ValSize = StoredValTy.getSizeInBits();
        if (ValSize < 32 || St->getMMO().getSizeInBits() != ValSize)
          continue;

        Register PtrReg = St->getPointerReg();
        if (mi_match(
                PtrReg, MRI,
                m_OneNonDBGUse(m_GPtrAdd(m_Reg(PtrBaseReg), m_ICst(Offset))))) {
          GPtrAdd *PtrAdd = cast<GPtrAdd>(MRI.getVRegDef(PtrReg));
          StoreInfo New = {St, PtrAdd, Offset.getSExtValue(), StoredValTy};

          if (Stores.empty()) {
            Stores.push_back(New);
            continue;
          }

          // Check if this store is a valid continuation of the sequence.
          auto &Last = Stores.back();
          if (storeIsValid(Last, New)) {
            Stores.push_back(New);
            LoadValsSinceLastStore.clear(); // Reset the load value tracking.
          } else {
            // The store isn't a valid continuation of the prior sequence, so
            // try to optimize what we have so far and start a new sequence.
            Changed |= tryOptimizeConsecStores(Stores, MIB);
            resetState();
            Stores.push_back(New);
          }
        }
      } else if (auto *Ld = dyn_cast<GLoad>(&MI)) {
        LoadValsSinceLastStore.push_back(Ld->getDstReg());
      }
    }
    Changed |= tryOptimizeConsecStores(Stores, MIB);
    resetState();
  }

  return Changed;
}

char AArch64PostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 MachineInstrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 MachineInstrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone) {
  return new AArch64PostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm