//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "MCTargetDesc/X86ShuffleDecode.h" #include "X86.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include
"llvm/IR/Intrinsics.h" // Operand of the `#include` that ends the previous line.
#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
// NOTE(review): the four `#include` directives below name no header, and every
// `cl::opt` declaration further down has no template argument list. It looks
// like angle-bracketed text was stripped from this file -- confirm against the
// upstream source and restore it before building.
#include
#include
#include
#include
using namespace llvm;

// Tag string for this file. NOTE(review): presumably consumed by LLVM's
// debug-output macros; no use of it is visible in this part of the file.
#define DEBUG_TYPE "x86-isel"

// Command-line option "x86-experimental-pref-innermost-loop-alignment",
// declared with cl::init(4) and cl::Hidden. Per its description string, the
// value is a log2 byte count giving the preferred alignment for innermost
// loops only, and when specified it overrides the alignment set by
// x86-experimental-pref-loop-alignment.
static cl::opt ExperimentalPrefInnermostLoopAlignment(
    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "for innermost loops only. If specified, this option overrides "
        "alignment set by x86-experimental-pref-loop-alignment."),
    cl::Hidden);

// Command-line option "x86-br-merging-base-cost", declared with cl::init(2)
// and cl::Hidden. Per its description string, this is the instruction-cost
// limit below which multiple conditionals are merged into one branch and above
// which they are split into separate branches; -1 means never merge.
static cl::opt BrMergingBaseCostThresh(
    "x86-br-merging-base-cost", cl::init(2),
    cl::desc(
        "Sets the cost threshold for when multiple conditionals will be merged "
        "into one branch versus be split in multiple branches. Merging "
        "conditionals saves branches at the cost of additional instructions. "
        "This value sets the instruction cost limit, below which conditionals "
        "will be merged, and above which conditionals will be split. Set to -1 "
        "to never merge branches."),
    cl::Hidden);

// Command-line option "x86-br-merging-ccmp-bias", declared with cl::init(6)
// and cl::Hidden. Per its description string, it increases
// 'x86-br-merging-base-cost' when the target supports conditional compare
// instructions.
static cl::opt BrMergingCcmpBias(
    "x86-br-merging-ccmp-bias", cl::init(6),
    cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
             "supports conditional compare instructions."),
    cl::Hidden);

// Command-line option "x86-br-merging-likely-bias", declared with cl::init(0)
// and cl::Hidden. Per its description string, it increases
// 'x86-br-merging-base-cost' when it is likely that all conditionals will be
// executed; -1 means never merge likely branches.
static cl::opt BrMergingLikelyBias(
    "x86-br-merging-likely-bias", cl::init(0),
    cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
             "that all conditionals will be executed. For example for merging "
             "the conditionals (a == b && c > d), if its known that a == b is "
             "likely, then it is likely that if the conditionals are split "
             "both sides will be executed, so it may be desirable to increase "
             "the instruction cost threshold. Set to -1 to never merge likely "
             "branches."),
    cl::Hidden);

// Command-line option "x86-br-merging-unlikely-bias", declared with
// cl::init(-1) and cl::Hidden. Per its description string, it decreases
// 'x86-br-merging-base-cost' when it is unlikely that all conditionals will be
// executed; -1 (the cl::init value here) means never merge unlikely branches.
static cl::opt BrMergingUnlikelyBias(
    "x86-br-merging-unlikely-bias", cl::init(-1),
    cl::desc(
        "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
        "that all conditionals will be executed. For example for merging "
        "the conditionals (a == b && c > d), if its known that a == b is "
        "unlikely, then it is unlikely that if the conditionals are split "
        "both sides will be executed, so it may be desirable to decrease "
        "the instruction cost threshold. Set to -1 to never merge unlikely "
        "branches."),
    cl::Hidden);

// Command-line option "mul-constant-optimization", declared with
// cl::init(true) and cl::Hidden. Per its description string, it controls
// replacing 'mul x, Const' with other instructions such as SHIFT and LEA.
static cl::opt MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // X86 instruction cache is coherent with its data cache so we can use the // default expansion to a no-op. setOperationAction(ISD::CLEAR_CACHE, MVT::Other, Expand); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling.
if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOptLevel::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 32); } // Setup Windows compiler runtime calls. if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall }, { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall }, { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall }, { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall }, { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } if (Subtarget.canUseCMPXCHG16B()) setMaxAtomicSizeInBitsSupported(128); else if (Subtarget.canUseCMPXCHG8B()) setMaxAtomicSizeInBitsSupported(64); else setMaxAtomicSizeInBitsSupported(32); setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64); setMaxLargeFPConvertBitWidthSupported(128); // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. 
setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) { setCondCodeAction(ISD::SETOEQ, VT, Expand); setCondCodeAction(ISD::SETUNE, VT, Expand); } // Integer absolute. if (Subtarget.canUseCMOV()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::ABS , MVT::i64 , Custom); } // Absolute difference. for (auto Op : {ISD::ABDS, ISD::ABDU}) { setOperationAction(Op , MVT::i8 , Custom); setOperationAction(Op , MVT::i16 , Custom); setOperationAction(Op , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(Op , MVT::i64 , Custom); } // Signed saturation subtraction. setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom); setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom); setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom); // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { // For slow shld targets we only lower for code size. LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal; setOperationAction(ShiftOp , MVT::i8 , Custom); setOperationAction(ShiftOp , MVT::i16 , Custom); setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction); if (Subtarget.is64Bit()) setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction); } if (!Subtarget.useSoftFloat()) { // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. 
setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); // SSE has no i16 to fp conversion, only i32. We promote in the handler // to allow f80 to use i16 and f64 to use i16 with sse1 only setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. 
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::LRINT, MVT::f32, Custom); setOperationAction(ISD::LRINT, MVT::f64, Custom); setOperationAction(ISD::LLRINT, MVT::f32, Custom); setOperationAction(ISD::LLRINT, MVT::f64, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::LRINT, MVT::i64, Custom); setOperationAction(ISD::LLRINT, MVT::i64, Custom); } } if (Subtarget.hasSSE2()) { // Custom lowering for saturating float to int conversions. // We handle promotion to larger result types manually. 
for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) { setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); } } // Handle address space casts between mixed sized pointers. setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!Subtarget.hasSSE2()) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. 
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FREM , MVT::f128 , Expand); if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) { setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom); setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom); setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom); setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom); setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom); } // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to // promote that too. 
setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32); if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::CTLZ , VT, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); } } for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16, ISD::STRICT_FP_TO_FP16}) { // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. setOperationAction( Op, MVT::f32, (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand); // There's never any support for operations beyond MVT::f32. 
setOperationAction(Op, MVT::f64, Expand); setOperationAction(Op, MVT::f80, Expand); setOperationAction(Op, MVT::f128, Expand); } for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand); setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand); } for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand); setTruncStoreAction(VT, MVT::f16, Expand); setTruncStoreAction(VT, MVT::bf16, Expand); setOperationAction(ISD::BF16_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_BF16, VT, Custom); } setOperationAction(ISD::PARITY, MVT::i8, Custom); setOperationAction(ISD::PARITY, MVT::i16, Custom); setOperationAction(ISD::PARITY, MVT::i32, Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::PARITY, MVT::i64, Custom); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); // popcntw is longer to encode than popcntl and also has a false dependency // on the dest that popcntl hasn't had since Cannon Lake. setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Custom); setOperationAction(ISD::CTPOP , MVT::i16 , Custom); setOperationAction(ISD::CTPOP , MVT::i32 , Custom); setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // X86 wants to expand cmov itself. 
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); // Darwin ABI issue. 
for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::ConstantPool , VT, Custom); setOperationAction(ISD::JumpTable , VT, Custom); setOperationAction(ISD::GlobalAddress , VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); setOperationAction(ISD::SRA_PARTS, VT, Custom); setOperationAction(ISD::SRL_PARTS, VT, Custom); } if (Subtarget.hasSSEPrefetch()) setOperationAction(ISD::PREFETCH , MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (!Subtarget.is64Bit()) setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); if (Subtarget.is64Bit() && Subtarget.hasAVX()) { // All CPUs supporting AVX will atomically load/store aligned 128-bit // values, so we can emit [V]MOVAPS/[V]MOVDQA. 
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); } if (Subtarget.canUseCMPXCHG16B()) setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); if (Subtarget.isTargetPS()) setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand); else setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); bool Is64Bit = Subtarget.is64Bit(); setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); auto setF16Action = [&] (MVT VT, LegalizeAction Action) { setOperationAction(ISD::FABS, VT, Action); setOperationAction(ISD::FNEG, VT, Action); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FREM, VT, Action); setOperationAction(ISD::FMA, VT, Action); setOperationAction(ISD::FMINNUM, VT, Action); setOperationAction(ISD::FMAXNUM, VT, Action); setOperationAction(ISD::FMINIMUM, VT, Action); setOperationAction(ISD::FMAXIMUM, VT, Action); setOperationAction(ISD::FSIN, VT, Action); setOperationAction(ISD::FCOS, VT, Action); setOperationAction(ISD::FSINCOS, VT, Action); setOperationAction(ISD::FTAN, VT, Action); setOperationAction(ISD::FSQRT, VT, Action); setOperationAction(ISD::FPOW, VT, Action); setOperationAction(ISD::FLOG, VT, Action); setOperationAction(ISD::FLOG2, VT, Action); setOperationAction(ISD::FLOG10, VT, Action); setOperationAction(ISD::FEXP, VT, Action); setOperationAction(ISD::FEXP2, VT, Action); setOperationAction(ISD::FEXP10, VT, Action); setOperationAction(ISD::FCEIL, VT, Action); setOperationAction(ISD::FFLOOR, VT, Action); setOperationAction(ISD::FNEARBYINT, VT, Action); setOperationAction(ISD::FRINT, VT, Action); setOperationAction(ISD::BR_CC, VT, Action); setOperationAction(ISD::SETCC, VT, Action); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Action); setOperationAction(ISD::FROUND, VT, Action); setOperationAction(ISD::FROUNDEVEN, VT, Action); setOperationAction(ISD::FTRUNC, VT, Action); setOperationAction(ISD::FLDEXP, VT, Action); }; if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { // f16, f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass); addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? 
&X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); // Disable f32->f64 extload as we can only generate this in one instruction // under optsize. So its easier to pattern match (fpext (load)) for that // case instead of needing to emit 2 instructions for extload in the // non-optsize case. setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG, VT, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); // These might be better off as horizontal vector ops. setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } // Half type will be promoted by default. 
setF16Action(MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote); setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote); setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLDEXP, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote); setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote); setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote); setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote); setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote); setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote); setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote); setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() && (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); if (UseX87) addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); if (UseX87) setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. if (UseX87) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (UseX87) { // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. 
setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } } // Expand FP32 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f32)) { if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) { addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0f)); // xorps } // Expand FP64 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f64)) { if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) { addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } // Support fp16 0 immediate. if (isTypeLegal(MVT::f16)) addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // f80 always uses X87. 
if (UseX87) { addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. // clang-format off setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FTAN , MVT::f80, Expand); setOperationAction(ISD::FASIN , MVT::f80, Expand); setOperationAction(ISD::FACOS , MVT::f80, Expand); setOperationAction(ISD::FATAN , MVT::f80, Expand); setOperationAction(ISD::FSINH , MVT::f80, Expand); setOperationAction(ISD::FCOSH , MVT::f80, Expand); setOperationAction(ISD::FTANH , MVT::f80, Expand); // clang-format on setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); setOperationAction(ISD::LROUND, MVT::f80, Expand); setOperationAction(ISD::LLROUND, MVT::f80, Expand); setOperationAction(ISD::LRINT, MVT::f80, Custom); setOperationAction(ISD::LLRINT, MVT::f80, Custom); // Handle constrained floating-point operations of scalar. 
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); if (isTypeLegal(MVT::f16)) { setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); } else { setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); } // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten // as Custom. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); } // f128 uses xmm registers, but most operations require libcalls. if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); setOperationAction(ISD::FMA, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); setOperationAction(ISD::FABS, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); // clang-format off setOperationAction(ISD::FSIN, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); setOperationAction(ISD::FCOS, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); setOperationAction(ISD::FTAN, MVT::f128, LibCall); 
setOperationAction(ISD::STRICT_FTAN, MVT::f128, LibCall); // clang-format on // No STRICT_FSINCOS setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); // We need to custom handle any FP_ROUND with an f128 input, but // LegalizeDAG uses the result type to know when to run a custom handler. // So we have to list all legal floating point result types here. if (isTypeLegal(MVT::f32)) { setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); } if (isTypeLegal(MVT::f64)) { setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); } if (isTypeLegal(MVT::f80)) { setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); } setOperationAction(ISD::SETCC, MVT::f128, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f80, Expand); } // Always use a library call for pow. 
setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FPOW , MVT::f128 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FEXP10, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { // clang-format off setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FTAN, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FEXP10, VT, Expand); // clang-format on } // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. 
for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::FROUNDEVEN, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); 
setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16 || VT.getVectorElementType() == MVT::bf16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom); setOperationAction(ISD::FMINIMUM, MVT::f32, Custom); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); } for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SREM, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::UREM, VT, Custom); } setOperationAction(ISD::MUL, MVT::v2i8, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v4i32, Custom); setOperationAction(ISD::MULHS, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i8, Custom); setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v2i32, Custom); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); setOperationAction(ISD::LRINT, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? 
Legal : Custom); } setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. 
setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } setOperationAction(ISD::SETCC, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v4f32, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } setF16Action(MVT::v8f16, Expand); setOperationAction(ISD::FADD, MVT::v8f16, Expand); setOperationAction(ISD::FSUB, MVT::v8f16, Expand); setOperationAction(ISD::FMUL, MVT::v8f16, Expand); setOperationAction(ISD::FDIV, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Custom); setOperationAction(ISD::FABS, MVT::v8f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Custom); // Custom lower v2i64 and v2f64 selects. 
setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. 
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); // Add 32-bit vector stores to help vectorization opportunities. setOperationAction(ISD::STORE, MVT::v2i16, Custom); setOperationAction(ISD::STORE, MVT::v4i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, 
Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); if (VT == MVT::v2i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } if (Subtarget.hasGFNI()) { setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); } // These might be better off as horizontal vector ops. 
setOperationAction(ISD::ADD, MVT::i16, Custom); setOperationAction(ISD::ADD, MVT::i32, Custom); setOperationAction(ISD::SUB, MVT::i16, Custom); setOperationAction(ISD::SUB, MVT::i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); setOperationAction(ISD::FROUND, RoundedTy, Custom); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); setOperationAction(ISD::SMULO, MVT::v2i32, Custom); // We directly match byte blends in the backend as they match the VSELECT // condition form. 
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); } if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can // do the pre and post work in the vector domain. setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); // We need to mark SINT_TO_FP as Custom even though we want to expand it // so that DAG combine doesn't try to turn it into uint_to_fp. setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) { setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // XOP can efficiently perform BITREVERSE with VPPERM.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { bool HasInt256 = Subtarget.hasInt256(); addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v8f32, Custom); setOperationAction(ISD::LRINT, MVT::v4f64, Custom); // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. 
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. 
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); if (VT == MVT::v4i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); setOperationAction(ISD::SELECT, MVT::v16i16, Custom); setOperationAction(ISD::SELECT, MVT::v16f16, Custom); setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. 
setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } setOperationAction(ISD::SETCC, MVT::v4f64, Custom); setOperationAction(ISD::SETCC, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom); if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); } } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i32, Custom); setOperationAction(ISD::MULHS, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? 
Legal : Custom); setOperationAction(ISD::SMULO, MVT::v32i8, Custom); setOperationAction(ISD::UMULO, MVT::v32i8, Custom); setOperationAction(ISD::ABS, MVT::v4i64, Custom); setOperationAction(ISD::SMAX, MVT::v4i64, Custom); setOperationAction(ISD::UMAX, MVT::v4i64, Custom); setOperationAction(ISD::SMIN, MVT::v4i64, Custom); setOperationAction(ISD::UMIN, MVT::v4i64, Custom); setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom); setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom); setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); } for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } if (HasInt256) { // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. 
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. 
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); } setF16Action(MVT::v16f16, Expand); setOperationAction(ISD::FNEG, MVT::v16f16, Custom); setOperationAction(ISD::FABS, MVT::v16f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v16f16, Custom); setOperationAction(ISD::FADD, MVT::v16f16, Expand); setOperationAction(ISD::FSUB, MVT::v16f16, Expand); setOperationAction(ISD::FMUL, MVT::v16f16, Expand); setOperationAction(ISD::FDIV, MVT::v16f16, Expand); if (HasInt256) { setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Custom legalize 2x32 to get a little better code. 
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); } } if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() && Subtarget.hasF16C()) { for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) { setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); } for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) { setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); } for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); } } // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. 
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); // There is no byte sized k-register load or store without AVX512DQ. if (!Subtarget.hasDQI()) { setOperationAction(ISD::LOAD, MVT::v1i1, Custom); setOperationAction(ISD::LOAD, MVT::v2i1, Custom); setOperationAction(ISD::LOAD, MVT::v4i1, Custom); setOperationAction(ISD::LOAD, MVT::v8i1, Custom); setOperationAction(ISD::STORE, MVT::v1i1, Custom); setOperationAction(ISD::STORE, MVT::v2i1, Custom); setOperationAction(ISD::STORE, MVT::v4i1, Custom); setOperationAction(ISD::STORE, MVT::v8i1, Custom); } // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. 
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) setOperationAction(ISD::VSELECT, VT, Expand); for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } if (Subtarget.hasDQI() && Subtarget.hasVLX()) { for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::LRINT, VT, Legal); setOperationAction(ISD::LLRINT, VT, Legal); } } // This block controls legalization for 512-bit operations with 8/16/32/64 bit // elements. 512-bits can be disabled based on prefer-vector-width and // required-vector-width function attributes. 
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { bool HasBWI = Subtarget.hasBWI(); addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v32f16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); if (HasBWI) setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); setOperationAction(ISD::LRINT, MVT::v8f64, Subtarget.hasDQI() ? 
Legal : Custom); if (Subtarget.hasDQI()) setOperationAction(ISD::LLRINT, MVT::v8f64, Legal); for (MVT VT : { MVT::v16i1, MVT::v16i8 }) { setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); } for (MVT VT : { MVT::v16i16, MVT::v16i32 }) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (HasBWI) setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); // With 
512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. if (!Subtarget.hasVLX()) { for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); if (HasBWI) { // Extends from v64i1 masks to 512-bit vectors. 
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); } for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? 
Legal : Custom); setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } setOperationAction(ISD::SETCC, MVT::v8f64, Custom); setOperationAction(ISD::SETCC, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasBWI ? 
Legal : Custom); setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom); } setOperationAction(ISD::FSHL, MVT::v64i8, Custom); setOperationAction(ISD::FSHR, MVT::v64i8, Custom); setOperationAction(ISD::FSHL, MVT::v32i16, Custom); setOperationAction(ISD::FSHR, MVT::v32i16, Custom); setOperationAction(ISD::FSHL, MVT::v16i32, Custom); setOperationAction(ISD::FSHR, MVT::v16i32, Custom); if (Subtarget.hasDQI()) { for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) setOperationAction(Opc, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. 
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); } setF16Action(MVT::v32f16, Expand); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom); for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } if (HasBWI) { for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } } else { setOperationAction(ISD::STORE, MVT::v32i16, Custom); setOperationAction(ISD::STORE, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } setOperationAction(ISD::ROTL, MVT::v32i16, Custom); setOperationAction(ISD::ROTR, MVT::v32i16, Custom); } setOperationAction(ISD::FNEG, MVT::v32f16, Custom); 
setOperationAction(ISD::FABS, MVT::v32f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom); }// useAVX512Regs if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) { for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } // This block controls legalization for operations that don't have // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // Custom legalize 2x32 to get a little better code. 
setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) { setOperationAction(Opc, MVT::v2i64, Custom); setOperationAction(Opc, MVT::v4i64, Custom); } setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); } if (Subtarget.hasCDI()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } } // This block control legalization of v32i1/v64i1 which are available with // AVX512BW.. if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); for (auto VT : { MVT::v32i1, MVT::v64i1 }) { setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. 
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); } // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { auto setGroup = [&] (MVT VT) { setOperationAction(ISD::FADD, VT, Legal); setOperationAction(ISD::STRICT_FADD, VT, Legal); setOperationAction(ISD::FSUB, VT, Legal); setOperationAction(ISD::STRICT_FSUB, VT, Legal); setOperationAction(ISD::FMUL, VT, Legal); setOperationAction(ISD::STRICT_FMUL, VT, Legal); setOperationAction(ISD::FDIV, VT, Legal); setOperationAction(ISD::STRICT_FDIV, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::STRICT_FSQRT, VT, Legal); setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); setOperationAction(ISD::FMA, VT, Legal); 
setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); }; // AVX512_FP16 scalar operations setGroup(MVT::f16); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::BR_CC, MVT::f16, Expand); setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom); setOperationAction(ISD::FMINIMUM, MVT::f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); 
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1, MVT::v32i16); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); } if (Subtarget.hasVLX()) { setGroup(MVT::v8f16); setGroup(MVT::v16f16); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal); setOperationAction(ISD::UINT_TO_FP, 
MVT::v8i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal); // Need to custom widen these to prevent scalarization. setOperationAction(ISD::LOAD, MVT::v4f16, Custom); setOperationAction(ISD::STORE, MVT::v4f16, Custom); } } if (!Subtarget.useSoftFloat() && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) { addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass); // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT. // Set the operation action Custom to do the customization later. 
setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom); for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { setF16Action(VT, Expand); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32); setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32); } setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom); addLegalFPImmediate(APFloat::getZero(APFloat::BFloat())); } if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) { addRegisterClass(MVT::v32bf16, &X86::VR512RegClass); setF16Action(MVT::v32bf16, Expand); for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32); setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } if 
(Subtarget.hasFP16()) { // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom); // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom); // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom); // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) { addRegisterClass(MVT::x86amx, &X86::TILERegClass); } // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); // Support carry in as value rather than glue. setOperationAction(ISD::UADDO_CARRY, VT, Custom); setOperationAction(ISD::USUBO_CARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); setOperationAction(ISD::SADDO_CARRY, VT, Custom); setOperationAction(ISD::SSUBO_CARRY, VT, Custom); } // Combine sin / cos into _sincos_stret if it is available. 
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); } // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) // clang-format off for (ISD::NodeType Op : {ISD::FACOS, ISD::STRICT_FACOS, ISD::FASIN, ISD::STRICT_FASIN, ISD::FATAN, ISD::STRICT_FATAN, ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FCOSH, ISD::STRICT_FCOSH, ISD::FEXP, ISD::STRICT_FEXP, ISD::FFLOOR, ISD::STRICT_FFLOOR, ISD::FREM, ISD::STRICT_FREM, ISD::FLOG, ISD::STRICT_FLOG, ISD::FLOG10, ISD::STRICT_FLOG10, ISD::FPOW, ISD::STRICT_FPOW, ISD::FSIN, ISD::STRICT_FSIN, ISD::FSINH, ISD::STRICT_FSINH, ISD::FTAN, ISD::STRICT_FTAN, ISD::FTANH, ISD::STRICT_FTANH}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); // clang-format on // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. 
MinGW has // it, but it's just a wrapper around ldexp. if (Subtarget.isOSWindows()) { for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); } // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, ISD::SCALAR_TO_VECTOR, ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::BITCAST, ISD::VSELECT, ISD::SELECT, ISD::SHL, ISD::SRA, ISD::SRL, ISD::OR, ISD::AND, ISD::AVGCEILS, ISD::AVGCEILU, ISD::AVGFLOORS, ISD::AVGFLOORU, ISD::BITREVERSE, ISD::ADD, ISD::FADD, ISD::FSUB, ISD::FNEG, ISD::FMA, ISD::STRICT_FMA, ISD::FMINNUM, ISD::FMAXNUM, ISD::SUB, ISD::LOAD, ISD::LRINT, ISD::LLRINT, ISD::MLOAD, ISD::STORE, ISD::MSTORE, ISD::TRUNCATE, ISD::ZERO_EXTEND, ISD::ANY_EXTEND, ISD::SIGN_EXTEND, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG, ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::SETCC, ISD::MUL, ISD::XOR, ISD::MSCATTER, ISD::MGATHER, ISD::FP16_TO_FP, ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND, ISD::FP_ROUND, ISD::STRICT_FP_ROUND}); computeRegisterProperties(Subtarget.getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; // TODO: These control memcmp expansion in CGP and could be raised higher, but // that needs to benchmarked and balanced with the potential use of vector // load/store types (PR33329, PR33914). MaxLoadsPerMemcmp = 2; MaxLoadsPerMemcmpOptSize = 2; // Default loop alignment, which can be overridden by -align-loops. 
setPrefLoopAlignment(Align(16)); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); // Default to having -disable-strictnode-mutation on IsStrictFPEnabled = true; } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { EVT PtrTy = getPointerTy(DAG.getDataLayout()); unsigned XorOp = Subtarget.is64Bit() ? 
X86::XOR64_FP : X86::XOR32_FP; MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); return SDValue(Node, 0); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(MVT VT) const { if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse) { if (!AssumeSingleUse && !Op.hasOneUse()) return false; if (!ISD::isNormalLoad(Op.getNode())) return false; // If this is an unaligned vector, make sure the target supports folding it. auto *Ld = cast(Op.getNode()); if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16)) return false; // TODO: If this is a non-temporal load and the target has an instruction // for it, it should not be folded. See "useNonTemporalLoad()". 
return true; } bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse) { assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory"); if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse)) return false; // We can not replace a wide volatile load with a broadcast-from-memory, // because that would narrow the load, which isn't legal for volatiles. auto *Ld = cast(Op.getNode()); return !Ld->isVolatile() || Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits(); } bool X86::mayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } bool X86::mayFoldIntoZeroExtend(SDValue Op) { if (Op.hasOneUse()) { unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); return (ISD::ZERO_EXTEND == Opcode); } return false; } static bool isLogicOp(unsigned Opcode) { // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage. return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode; } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::VALIGN: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: case X86ISD::MOVHLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::MOVSH: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VPERMIL2: case X86ISD::VPERMI: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VZEXT_MOVL: return true; } } static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; // Target Shuffles. 
case X86ISD::PSHUFB: case X86ISD::VPERMILPV: case X86ISD::VPERMIL2: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: return true; // 'Faux' Target Shuffles. case ISD::OR: case ISD::AND: case X86ISD::ANDNP: return true; } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM, bool HasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!HasSymbolicDisplacement) return true; // We can fold large offsets in the large code model because we always use // 64-bit offsets. if (CM == CodeModel::Large) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (CM == CodeModel::Kernel) return Offset >= 0; // For other non-large code models we assume that latest small object is 16MB // before end of 31 bits boundary. We may also accept pretty large negative // constants knowing that all objects are in the positive half of address // space. return Offset < 16 * 1024 * 1024; } /// Return true if the condition is an signed comparison operation. 
static bool isX86CCSigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: case X86::COND_NE: case X86::COND_B: case X86::COND_A: case X86::COND_BE: case X86::COND_AE: return false; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return true; } } static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { // clang-format off default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; // clang-format on } } /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) { // X >= 0 -> X == 0, jump on !sign. return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. 
if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { // clang-format off default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; // clang-format on } } /// Is there a floating point cmov for the specific X86 condition code? /// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 
static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) { return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() || VT.is512BitVector(); } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { Info.flags = MachineMemOperand::MONone; Info.offset = 0; const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) { switch (Intrinsic) { case Intrinsic::x86_aesenc128kl: case Intrinsic::x86_aesdec128kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(1); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; return true; case Intrinsic::x86_aesenc256kl: case Intrinsic::x86_aesdec256kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(1); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; return true; case Intrinsic::x86_aesencwide128kl: case Intrinsic::x86_aesdecwide128kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; return true; case Intrinsic::x86_aesencwide256kl: case Intrinsic::x86_aesdecwide256kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; return true; case Intrinsic::x86_cmpccxadd32: case Intrinsic::x86_cmpccxadd64: case Intrinsic::x86_atomic_bts: case Intrinsic::x86_atomic_btc: case Intrinsic::x86_atomic_btr: { Info.opc = 
ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); unsigned Size = I.getType()->getScalarSizeInBits(); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); Info.align = Align(Size); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } case Intrinsic::x86_atomic_bts_rm: case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); Info.align = Align(Size); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } case Intrinsic::x86_aadd32: case Intrinsic::x86_aadd64: case Intrinsic::x86_aand32: case Intrinsic::x86_aand64: case Intrinsic::x86_aor32: case Intrinsic::x86_aor64: case Intrinsic::x86_axor32: case Intrinsic::x86_axor64: case Intrinsic::x86_atomic_add_cc: case Intrinsic::x86_atomic_sub_cc: case Intrinsic::x86_atomic_or_cc: case Intrinsic::x86_atomic_and_cc: case Intrinsic::x86_atomic_xor_cc: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); Info.align = Align(Size); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } } return false; } switch (IntrData->Type) { case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; if (IntrData->Type == TRUNCATE_TO_MEM_VI8) ScalarVT = MVT::i8; else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) ScalarVT = MVT::i16; else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) ScalarVT 
= MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; break; } case GATHER: case GATHER_AVX2: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; break; } case SCATTER: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; break; } default: return false; } return true; } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { for (const APFloat &FPImm : LegalFPImmediates) if (Imm.bitwiseIsEqual(FPImm)) return true; return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { assert(cast(Load)->isSimple() && "illegal to narrow"); // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. 
SDValue BasePtr = cast(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; // If this is an (1) AVX vector load with (2) multiple uses and (3) all of // those uses are extracted directly into a store, then the extract + store // can be store-folded. Therefore, it's probably not worth splitting the load. EVT VT = Load->getValueType(0); if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) { // Skip uses of the chain value. Result 0 of the node is the load value. if (UI.getUse().getResNo() != 0) continue; // If this use is not an extract + store, it's probably worth splitting. if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() || UI->use_begin()->getOpcode() != ISD::STORE) return true; } // All non-chain uses are extract + store. return false; } return true; } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { // If we are using XMM registers in the ABI and the condition of the select is // a floating-point compare and we have blendv or conditional move, then it is // cheaper to select instead of doing a cross-register move and creating a // load that depends on the compare result. 
bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128; return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); } bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { // TODO: It might be a win to ease or lift this restriction, but the generic // folds in DAGCombiner conflict with vector folds for an AVX512 target. if (VT.isVector() && Subtarget.hasAVX512()) return false; return true; } bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const { // TODO: We handle scalars using custom code, but generic combining could make // that unnecessary. APInt MulC; if (!ISD::isConstantSplatVector(C.getNode(), MulC)) return false; // Find the type this will be legalized too. Otherwise we might prematurely // convert this to shl+add/sub and then still have to type legalize those ops. // Another choice would be to defer the decision for illegal types until // after type legalization. But constant splat vectors of i64 can't make it // through type legalization on 32-bit targets so we would need to special // case vXi64. while (getTypeAction(Context, VT) != TypeLegal) VT = getTypeToTransformTo(Context, VT); // If vector multiply is legal, assume that's faster than shl + add/sub. // Multiply is a complex op with higher latency and lower throughput in // most implementations, sub-vXi32 vector multiplies are always fast, // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64) // is always going to be slow. 
unsigned EltSizeInBits = VT.getScalarSizeInBits(); if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 && (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow())) return false; // shl+add, shl+sub, shl+add+neg return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() || (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); } bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; // Mask vectors support all subregister combinations and operations that // extract half of vector. if (ResVT.getVectorElementType() == MVT::i1) return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) && (Index == ResVT.getVectorNumElements())); return (Index % ResVT.getVectorNumElements()) == 0; } bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { unsigned Opc = VecOp.getOpcode(); // Assume target opcodes can't be scalarized. // TODO - do we have any exceptions? if (Opc >= ISD::BUILTIN_OP_END) return false; // If the vector op is not supported, try to convert to scalar. EVT VecVT = VecOp.getValueType(); if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) return true; // If the vector op is supported, but the scalar op is not, the transform may // not be worthwhile. EVT ScalarVT = VecVT.getScalarType(); return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, bool) const { // TODO: Allow vectors? if (VT.isVector()) return false; return VT.isSimple() || !isOperationExpand(Opcode, VT); } bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const { // Speculate cttz only if we can directly use TZCNT or can promote to i32. return Subtarget.hasBMI() || (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32); } bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { // Speculate ctlz only if we can directly use LZCNT. 
return Subtarget.hasLZCNT(); } bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more // expensive than a straight movsd. On the other hand, it's important to // shrink long double fp constant since fldt is very slow. return !Subtarget.hasSSE2() || VT == MVT::f80; } bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && Subtarget.hasSSE2()) || (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16; } bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const { if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && BitcastVT.getVectorElementType() == MVT::i1) return false; if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) return false; // If both types are legal vectors, it's always ok to convert them. if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) return true; return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); } bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const { // Do not merge to float value size (128 bytes) if no implicit // float attribute is set. bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat); if (NoFloat) { unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; return (MemVT.getSizeInBits() <= MaxIntSize); } // Make sure we don't merge greater than our preferred vector // width. 
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) return false; return true; } bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { return true; } bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { EVT VT = Y.getValueType(); if (VT.isVector()) return false; if (!Subtarget.hasBMI()) return false; // There are only 32-bit and 64-bit forms for 'andn'. if (VT != MVT::i32 && VT != MVT::i64) return false; return !isa(Y) || cast(Y)->isOpaque(); } bool X86TargetLowering::hasAndNot(SDValue Y) const { EVT VT = Y.getValueType(); if (!VT.isVector()) return hasAndNotCompare(Y); // Vector. if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128) return false; if (VT == MVT::v4i32) return true; return Subtarget.hasSSE2(); } bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const { return X.getValueType().isScalarInteger(); // 'bt' } bool X86TargetLowering:: shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const { // Does baseline recommend not to perform the fold by default? if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) return false; // For scalars this transform is always beneficial. if (X.getValueType().isScalarInteger()) return true; // If all the shift amounts are identical, then transform is beneficial even // with rudimentary SSE2 shifts. if (DAG.isSplatValue(Y, /*AllowUndefs=*/true)) return true; // If we have AVX2 with it's powerful shift operations, then it's also good. if (Subtarget.hasAVX2()) return true; // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'. 
return NewShiftOpcode == ISD::SHL; } unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand( EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional &AndMask) const { if (!VT.isInteger()) return ShiftOpc; bool PreferRotate = false; if (VT.isVector()) { // For vectors, if we have rotate instruction support, then its definetly // best. Otherwise its not clear what the best so just don't make changed. PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64); } else { // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer // rotate unless we have a zext mask+shr. PreferRotate = Subtarget.hasBMI2(); if (!PreferRotate) { unsigned MaskBits = VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue(); PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32); } } if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) { assert(AndMask.has_value() && "Null andmask when querying about shift+and"); if (PreferRotate && MayTransformRotate) return ISD::ROTL; // If vector we don't really get much benefit swapping around constants. // Maybe we could check if the DAG has the flipped node already in the // future. if (VT.isVector()) return ShiftOpc; // See if the beneficial to swap shift type. if (ShiftOpc == ISD::SHL) { // If the current setup has imm64 mask, then inverse will have // at least imm32 mask (or be zext i32 -> i64). if (VT == MVT::i64) return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL : ShiftOpc; // We can only benefit if req at least 7-bit for the mask. We // don't want to replace shl of 1,2,3 as they can be implemented // with lea/add. return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc; } if (VT == MVT::i64) // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is // extremely efficient. return AndMask->getSignificantBits() > 33 ? 
(unsigned)ISD::SHL : ShiftOpc; // Keep small shifts as shl so we can generate add/lea. return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc; } // We prefer rotate for vectors of if we won't get a zext mask with SRL // (PreferRotate will be set in the latter case). if (PreferRotate || !MayTransformRotate || VT.isVector()) return ShiftOpc; // Non-vector type and we have a zext mask with SRL. return ISD::SRL; } TargetLoweringBase::CondMergingParams X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const { using namespace llvm::PatternMatch; int BaseCost = BrMergingBaseCostThresh.getValue(); // With CCMP, branches can be merged in a more efficient way. if (BaseCost >= 0 && Subtarget.hasCCMP()) BaseCost += BrMergingCcmpBias; // a == b && a == c is a fast pattern on x86. ICmpInst::Predicate Pred; if (BaseCost >= 0 && Opc == Instruction::And && match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) && Pred == ICmpInst::ICMP_EQ && match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) && Pred == ICmpInst::ICMP_EQ) BaseCost += 1; return {BaseCost, BrMergingLikelyBias.getValue(), BrMergingUnlikelyBias.getValue()}; } bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const { return N->getOpcode() != ISD::FP_EXTEND; } bool X86TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && N->getOperand(0).getOpcode() == ISD::SRL) || (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); // TODO: Should we always create i64 masks? Or only folded immediates? EVT VT = N->getValueType(0); if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { // Only fold if the shift values are equal - so it folds to AND. // TODO - we should fold if either is a non-uniform vector but we don't do // the fold for non-splats yet. 
return N->getOperand(1) == N->getOperand(0).getOperand(1); } return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); } bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { EVT VT = Y.getValueType(); // For vectors, we don't have a preference, but we probably want a mask. if (VT.isVector()) return false; // 64-bit shifts on 32-bit targets produce really bad bloated code. if (VT == MVT::i64 && !Subtarget.is64Bit()) return false; return true; } TargetLowering::ShiftLegalizationStrategy X86TargetLowering::preferredShiftLegalizationStrategy( SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const { if (DAG.getMachineFunction().getFunction().hasMinSize() && !Subtarget.isOSWindows()) return ShiftLegalizationStrategy::LowerToLibcall; return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, ExpansionFactor); } bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const { // Any legal vector type can be splatted more efficiently than // loading/spilling from memory. return isTypeLegal(VT); } MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { MVT VT = MVT::getIntegerVT(NumBits); if (isTypeLegal(VT)) return VT; // PMOVMSKB can handle this. if (NumBits == 128 && isTypeLegal(MVT::v16i8)) return MVT::v16i8; // VPMOVMSKB can handle this. if (NumBits == 256 && isTypeLegal(MVT::v32i8)) return MVT::v32i8; // TODO: Allow 64-bit type for 32-bit target. // TODO: 512-bit types should be allowed, but make sure that those // cases are handled in combineVectorSizedSetCCEquality(). return MVT::INVALID_SIMPLE_VALUE_TYPE; } /// Val is the undef sentinel value or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); } /// Return true if every element in Mask is the undef sentinel value or equal to /// the specified value. 
static bool isUndefOrEqual(ArrayRef Mask, int CmpVal) { return llvm::all_of(Mask, [CmpVal](int M) { return (M == SM_SentinelUndef) || (M == CmpVal); }); } /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value or equal to the specified value. static bool isUndefOrEqualInRange(ArrayRef Mask, int CmpVal, unsigned Pos, unsigned Size) { return llvm::all_of(Mask.slice(Pos, Size), [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); }); } /// Val is either the undef or zero sentinel value. static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { return llvm::all_of(Mask.slice(Pos, Size), [](int M) { return M == SM_SentinelUndef; }); } /// Return true if the mask creates a vector whose lower half is undefined. static bool isUndefLowerHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, 0, NumElts / 2); } /// Return true if the mask creates a vector whose upper half is undefined. static bool isUndefUpperHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, NumElts / 2, NumElts / 2); } /// Return true if Val falls within the specified range (L, H]. static bool isInRange(int Val, int Low, int Hi) { return (Val >= Low && Val < Hi); } /// Return true if the value of any element in Mask falls within the specified /// range (L, H]. static bool isAnyInRange(ArrayRef Mask, int Low, int Hi) { return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } /// Return true if the value of any element in Mask is the zero sentinel value. 
static bool isAnyZero(ArrayRef Mask) { return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); } /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. static bool isUndefOrInRange(ArrayRef Mask, int Low, int Hi) { return llvm::all_of( Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); }); } /// Return true if Val is undef, zero or if its value falls within the /// specified range (L, H]. static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { return isUndefOrZero(Val) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef Mask, int Low, int Hi) { return llvm::all_of( Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); }); } /// Return true if every element in Mask, is an in-place blend/select mask or is /// undef. LLVM_ATTRIBUTE_UNUSED static bool isBlendOrUndef(ArrayRef Mask) { unsigned NumElts = Mask.size(); for (auto [I, M] : enumerate(Mask)) if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos + Size, falls within the specified /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef. 
static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low, int Step = 1) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low, int Step = 1) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef or is zero. static bool isUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero); } /// Return true if every element of a single input is referenced by the shuffle /// mask. i.e. it just permutes them all. static bool isCompletePermute(ArrayRef Mask) { unsigned NumElts = Mask.size(); APInt DemandedElts = APInt::getZero(NumElts); for (int M : Mask) if (isInRange(M, 0, NumElts)) DemandedElts.setBit(M); return DemandedElts.isAllOnes(); } /// Helper function to test whether a shuffle mask could be /// simplified by widening the elements being shuffled. /// /// Appends the mask for wider elements in WidenedMask if valid. Otherwise /// leaves it in an unspecified state. /// /// NOTE: This must handle normal vector shuffle masks and *target* vector /// shuffle masks. The latter have the special property of a '-2' representing /// a zero-ed lane of a vector. static bool canWidenShuffleElements(ArrayRef Mask, SmallVectorImpl &WidenedMask) { WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { int M0 = Mask[i]; int M1 = Mask[i + 1]; // If both elements are undef, its trivial. 
if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { WidenedMask[i / 2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { WidenedMask[i / 2] = M1 / 2; continue; } if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { WidenedMask[i / 2] = M0 / 2; continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } return false; } // Finally check if the two mask values are adjacent and aligned with // a pair. if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { WidenedMask[i / 2] = M0 / 2; continue; } // Otherwise we can't safely widen the elements used in this shuffle. return false; } assert(WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"); return true; } static bool canWidenShuffleElements(ArrayRef Mask, const APInt &Zeroable, bool V2IsZero, SmallVectorImpl &WidenedMask) { // Create an alternative mask with info about zeroable elements. // Here we do not set undef elements as zeroable. SmallVector ZeroableMask(Mask); if (V2IsZero) { assert(!Zeroable.isZero() && "V2's non-undef elements are used?!"); for (int i = 0, Size = Mask.size(); i != Size; ++i) if (Mask[i] != SM_SentinelUndef && Zeroable[i]) ZeroableMask[i] = SM_SentinelZero; } return canWidenShuffleElements(ZeroableMask, WidenedMask); } static bool canWidenShuffleElements(ArrayRef Mask) { SmallVector WidenedMask; return canWidenShuffleElements(Mask, WidenedMask); } // Attempt to narrow/widen shuffle mask until it matches the target number of // elements. 
static bool scaleShuffleElements(ArrayRef Mask, unsigned NumDstElts, SmallVectorImpl &ScaledMask) { unsigned NumSrcElts = Mask.size(); assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) && "Illegal shuffle scale factor"); // Narrowing is guaranteed to work. if (NumDstElts >= NumSrcElts) { int Scale = NumDstElts / NumSrcElts; llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask); return true; } // We have to repeat the widening until we reach the target size, but we can // split out the first widening as it sets up ScaledMask for us. if (canWidenShuffleElements(Mask, ScaledMask)) { while (ScaledMask.size() > NumDstElts) { SmallVector WidenedMask; if (!canWidenShuffleElements(ScaledMask, WidenedMask)) return false; ScaledMask = std::move(WidenedMask); } return true; } return false; } static bool canScaleShuffleElements(ArrayRef Mask, unsigned NumDstElts) { SmallVector ScaledMask; return scaleShuffleElements(Mask, NumDstElts, ScaledMask); } /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants. // Use an UNDEF node if MaskElt == -1. // Split 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask = false) { SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) Ops.push_back(IsUndef ? 
DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } static SDValue getConstVector(ArrayRef Bits, const APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Bits.size() == Undefs.getBitWidth() && "Unequal constant and undef arrays"); SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Undefs[i]) { Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT)); continue; } const APInt &V = Bits[i]; assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes"); if (Split) { Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); } else if (EltVT == MVT::f32) { APFloat FV(APFloat::IEEEsingle(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else if (EltVT == MVT::f64) { APFloat FV(APFloat::IEEEdouble(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else { Ops.push_back(DAG.getConstant(V, dl, EltVT)); } } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); return DAG.getBitcast(VT, ConstsNode); } static SDValue getConstVector(ArrayRef Bits, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { APInt Undefs = APInt::getZero(Bits.size()); return getConstVector(Bits, Undefs, VT, DAG, dl); } /// Returns a vector of specified type with all zero elements. 
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() ||
          VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The zero is first materialised in one canonical type per case and only
  // then bitcast to VT, so that equal zero vectors get CSE'd. A floating-point
  // +0.0 is used where the integer type is not available.
  auto BuildCanonicalZero = [&]() -> SDValue {
    // 128-bit vector without SSE2: use a v4f32 zero.
    if (!Subtarget.hasSSE2() && VT.is128BitVector())
      return DAG.getConstantFP(+0.0, dl, MVT::v4f32);

    // FP vector whose element type is legal: zero directly in VT.
    if (VT.isFloatingPoint() && TLI.isTypeLegal(VT.getVectorElementType()))
      return DAG.getConstantFP(+0.0, dl, VT);

    // Mask (vXi1) vector: zero directly in VT.
    if (VT.getVectorElementType() == MVT::i1) {
      assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
             "Unexpected vector type");
      return DAG.getConstant(0, dl, VT);
    }

    // Everything else: a vXi32 zero of the same total width.
    const unsigned NumI32Lanes = VT.getSizeInBits() / 32;
    return DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, NumI32Lanes));
  };

  return DAG.getBitcast(VT, BuildCanonicalZero());
}

// Helper to determine if the ops are all the extracted subvectors come from a
// single source. If we allow commute they don't have to be in order (Lo/Hi).
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
  // Both operands must be extractions ...
  if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR)
    return SDValue();

  // ... of the same type, taken from the same vector ...
  if (LHS.getValueType() != RHS.getValueType() ||
      LHS.getOperand(0) != RHS.getOperand(0))
    return SDValue();

  // ... which is exactly twice as wide as each piece.
  SDValue Src = LHS.getOperand(0);
  if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
    return SDValue();

  const unsigned HalfNumElts = LHS.getValueType().getVectorNumElements();
  const APInt &LHSIdx = LHS.getConstantOperandAPInt(1);
  const APInt &RHSIdx = RHS.getConstantOperandAPInt(1);

  // Accept Lo/Hi order, and Hi/Lo order when commuting is permitted.
  const bool InOrder = LHSIdx == 0 && RHSIdx == HalfNumElts;
  const bool Swapped = AllowCommute && RHSIdx == 0 && LHSIdx == HalfNumElts;
  if (InOrder || Swapped)
    return Src;

  return SDValue();
}

/// Extract the vectorWidth-bit chunk of Vec that contains element IdxVal.
/// IdxVal does not have to be chunk aligned; it is rounded down to the first
/// element of its chunk.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, const SDLoc &dl,
                                unsigned vectorWidth) {
  const EVT SrcVT = Vec.getValueType();
  const EVT EltVT = SrcVT.getVectorElementType();
  const unsigned NumChunks = SrcVT.getSizeInBits() / vectorWidth;
  const EVT ChunkVT = EVT::getVectorVT(
      *DAG.getContext(), EltVT, SrcVT.getVectorNumElements() / NumChunks);

  const unsigned EltsPerChunk = vectorWidth / EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltsPerChunk) && "Elements per chunk not power of 2");

  // EltsPerChunk is a power of 2, so masking off the low bits yields the
  // index of the first element of the chunk we want.
  IdxVal &= ~(EltsPerChunk - 1);

  // A build_vector source can simply be re-emitted as a smaller build_vector.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ChunkVT, dl,
                              Vec->ops().slice(IdxVal, EltsPerChunk));

  // Extracting beyond a subvector that was inserted at index 0 of an undef
  // vector (a widening pattern) reads only undef.
  if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      Vec.getOperand(0).isUndef() &&
      Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
      isNullConstant(Vec.getOperand(2)))
    return DAG.getUNDEF(ChunkVT);

  SDValue ChunkIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ChunkVT, Vec, ChunkIdx);
}

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) &&
         "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, /*vectorWidth=*/128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, /*vectorWidth=*/256);
}

/// Insert the vectorWidth-bit vector Vec into Result at the chunk that
/// contains element IdxVal. An undef Vec leaves Result untouched.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");

  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;

  const EVT EltVT = Vec.getValueType().getVectorElementType();
  const EVT DstVT = Result.getValueType();

  const unsigned EltsPerChunk = vectorWidth / EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltsPerChunk) && "Elements per chunk not power of 2");

  // EltsPerChunk is a power of 2, so masking off the low bits yields the
  // index of the first element of the chunk we want.
  IdxVal &= ~(EltsPerChunk - 1);

  SDValue ChunkIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, DstVT, Result, Vec, ChunkIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, /*vectorWidth=*/128);
}

/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl) {
  assert(Vec.getValueSizeInBits().getFixedValue() <= VT.getFixedSizeInBits() &&
         Vec.getValueType().getScalarType() == VT.getScalarType() &&
         "Unsupported vector widening type");

  // Start from an all-zero or undef wide vector and drop Vec into its low
  // elements.
  SDValue Base;
  if (ZeroNewElements)
    Base = getZeroVector(VT, Subtarget, DAG, dl);
  else
    Base = DAG.getUNDEF(VT);

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Base, Vec, ZeroIdx);
}

/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl, unsigned WideSizeInBits) { assert(Vec.getValueSizeInBits() <= WideSizeInBits && (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && "Unsupported vector widening type"); unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); MVT SVT = Vec.getSimpleValueType().getScalarType(); MVT VT = MVT::getVectorVT(SVT, WideNumElts); return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } /// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT /// and bitcast with integer types. static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) { assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector"); unsigned NumElts = VT.getVectorNumElements(); if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; return VT; } /// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and /// bitcast with integer types. static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget); return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } // Helper function to collect subvector ops that are concatenated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. 
static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops, SelectionDAG &DAG) { assert(Ops.empty() && "Expected an empty ops vector"); if (N->getOpcode() == ISD::CONCAT_VECTORS) { Ops.append(N->op_begin(), N->op_end()); return true; } if (N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Src = N->getOperand(0); SDValue Sub = N->getOperand(1); const APInt &Idx = N->getConstantOperandAPInt(2); EVT VT = Src.getValueType(); EVT SubVT = Sub.getValueType(); if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) { // insert_subvector(undef, x, lo) if (Idx == 0 && Src.isUndef()) { Ops.push_back(Sub); Ops.push_back(DAG.getUNDEF(SubVT)); return true; } if (Idx == (VT.getVectorNumElements() / 2)) { // insert_subvector(insert_subvector(undef, x, lo), y, hi) if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && Src.getOperand(1).getValueType() == SubVT && isNullConstant(Src.getOperand(2))) { // Attempt to recurse into inner (matching) concats. SDValue Lo = Src.getOperand(1); SDValue Hi = Sub; SmallVector LoOps, HiOps; if (collectConcatOps(Lo.getNode(), LoOps, DAG) && collectConcatOps(Hi.getNode(), HiOps, DAG) && LoOps.size() == HiOps.size()) { Ops.append(LoOps); Ops.append(HiOps); return true; } Ops.push_back(Lo); Ops.push_back(Hi); return true; } // insert_subvector(x, extract_subvector(x, lo), hi) if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { Ops.append(2, Sub); return true; } // insert_subvector(undef, x, hi) if (Src.isUndef()) { Ops.push_back(DAG.getUNDEF(SubVT)); Ops.push_back(Sub); return true; } } } } return false; } // Helper to check if \p V can be split into subvectors and the upper subvectors // are all undef. In which case return the lower subvector. 
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG) { SmallVector SubOps; if (!collectConcatOps(V.getNode(), SubOps, DAG)) return SDValue(); unsigned NumSubOps = SubOps.size(); unsigned HalfNumSubOps = NumSubOps / 2; assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors"); ArrayRef UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end()); if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); })) return SDValue(); EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); ArrayRef LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps); return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps); } // Helper to check if we can access all the constituent subvectors without any // extract ops. static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG) { SmallVector Ops; return collectConcatOps(N, Ops, DAG); } static std::pair splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) { EVT VT = Op.getValueType(); unsigned NumElems = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 && "Can't split odd sized vector"); // If this is a splat value (with no-undefs) then use the lower subvector, // which should be a free extraction. SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2); if (DAG.isSplatValue(Op, /*AllowUndefs*/ false)) return std::make_pair(Lo, Lo); SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2); return std::make_pair(Lo, Hi); } /// Break an operation into 2 half sized ops and then concatenate the results. 
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
  unsigned NumOps = Op.getNumOperands();
  EVT VT = Op.getValueType();

  // Extract the LHS Lo/Hi vectors
  SmallVector<SDValue> LoOps(NumOps, SDValue());
  SmallVector<SDValue> HiOps(NumOps, SDValue());
  for (unsigned I = 0; I != NumOps; ++I) {
    SDValue SrcOp = Op.getOperand(I);
    // Non-vector operands are passed unchanged to both half-width ops.
    if (!SrcOp.getValueType().isVector()) {
      LoOps[I] = HiOps[I] = SrcOp;
      continue;
    }
    std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
  }

  // Re-issue the same opcode on each half and glue the results back together.
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
                     DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
}

/// Break an unary integer operation into 2 half sized ops and then
/// concatenate the result back.
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  // Make sure we only try to split 256/512-bit types to avoid creating
  // narrow vectors.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((Op.getOperand(0).getValueType().is256BitVector() ||
          Op.getOperand(0).getValueType().is512BitVector()) &&
         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
             VT.getVectorNumElements() &&
         "Unexpected VTs!");
  return splitVectorOp(Op, DAG, dl);
}

/// Break a binary integer operation into 2 half sized ops and then
/// concatenate the result back.
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
                                    const SDLoc &dl) {
  // Assert that all the types match.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert(Op.getOperand(0).getValueType() == VT &&
         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  return splitVectorOp(Op, DAG, dl);
}

// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for // deciding if/how to split Ops. Ops elements do *not* have to be of type VT. // The argument Builder is a function that will be applied on each split part: // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef) template SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef Ops, F Builder, bool CheckBWI = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; if ((CheckBWI && Subtarget.useBWIRegs()) || (!CheckBWI && Subtarget.useAVX512Regs())) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); } } else if (Subtarget.hasAVX2()) { if (VT.getSizeInBits() > 256) { NumSubs = VT.getSizeInBits() / 256; assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size"); } } else { if (VT.getSizeInBits() > 128) { NumSubs = VT.getSizeInBits() / 128; assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size"); } } if (NumSubs == 1) return Builder(DAG, DL, Ops); SmallVector Subs; for (unsigned i = 0; i != NumSubs; ++i) { SmallVector SubOps; for (SDValue Op : Ops) { EVT OpVT = Op.getValueType(); unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs; unsigned SizeSub = OpVT.getSizeInBits() / NumSubs; SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub)); } Subs.push_back(Builder(DAG, DL, SubOps)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX // targets. 
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  assert(Subtarget.hasAVX512() && "AVX512 target expected");
  MVT SVT = VT.getScalarType();

  // If we have a 32/64 splatted constant, splat it to DstTy to
  // encourage a foldable broadcast'd operand.
  auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
    unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
    // AVX512 broadcasts 32/64-bit operands.
    // TODO: Support float once getAVX512Node is used by fp-ops.
    if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
        !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
      return SDValue();
    // If we're not widening, don't bother if we're not bitcasting.
    if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
      return SDValue();
    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
      APInt SplatValue, SplatUndef;
      unsigned SplatBitSize;
      bool HasAnyUndefs;
      if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, OpEltSizeInBits) &&
          !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
        return DAG.getConstant(SplatValue, DL, DstVT);
    }
    return SDValue();
  };

  // Widening is only required when VT is narrower than 512 bits and the
  // subtarget lacks VLX.
  bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());

  MVT DstVT = VT;
  if (Widen)
    DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());

  // Canonicalize src operands.
  SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
  for (SDValue &Op : SrcOps) {
    MVT OpVT = Op.getSimpleValueType();
    // Just pass through scalar operands.
    if (!OpVT.isVector())
      continue;
    assert(OpVT == VT && "Vector type mismatch");

    if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
      Op = BroadcastOp;
      continue;
    }

    // Just widen the subvector by inserting into an undef wide vector.
    if (Widen)
      Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
  }

  SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);

  // Perform the 512-bit op then extract the bottom subvector.
  if (Widen)
    Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
  return Res;
}

/// Insert i1-subvector to i1-vector.
///
/// \p Op is expected to be an INSERT_SUBVECTOR-shaped node: operand 0 is the
/// destination vector, operand 1 the subvector and operand 2 a constant index.
/// The work is done in a widened mask type (see widenMaskVectorType) and the
/// result is extracted back to the type of \p Op.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  unsigned IdxVal = Op.getConstantOperandVal(2);

  // Inserting undef is a nop. We can just return the original vector.
  if (SubVec.isUndef())
    return Vec;

  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

  // Extend to natively supported kshift.
  MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);

  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
  // if necessary.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // May need to promote to a legal type.
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                     DAG.getConstant(0, dl, WideOpVT), SubVec, Idx);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  SDValue Undef = DAG.getUNDEF(WideOpVT);

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
    SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
                      ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                         DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx);
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // From here on IdxVal != 0. Place SubVec in the low elements of a wide
  // vector whose remaining elements are undef.
  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec,
                       ZeroIdx);

  if (Vec.isUndef()) {
    assert(IdxVal != 0 && "Unexpected index");
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    assert(IdxVal != 0 && "Unexpected index");
    // If upper elements of Vec are known undef, then just shift into place.
    if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
                     [](SDValue V) { return V.isUndef(); })) {
      SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                           DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    } else {
      // Shift fully left, then back right to the insert position.
      NumElems = WideOpVT.getVectorNumElements();
      unsigned ShiftLeft = NumElems - SubVecNumElems;
      unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
      SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                           DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
      if (ShiftRight != 0)
        SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                             DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    }
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  // Simple case when we put subvector in the upper part
  if (IdxVal + SubVecNumElems == NumElems) {
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    if (SubVecNumElems * 2 == NumElems) {
      // Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx);
    } else {
      // Otherwise use explicit shifts to zero the bits.
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
                        ZeroIdx);
      NumElems = WideOpVT.getVectorNumElements();
      SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    }
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Inserting into the middle is more complicated.

  NumElems = WideOpVT.getVectorNumElements();

  // Widen the vector if needed.
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

  unsigned ShiftLeft = NumElems - SubVecNumElems;
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

  // Do an optimization for the most frequently used types.
  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
    // Mask0 has every bit set except the ones covered by the insertion.
    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
    Mask0.flipAllBits();
    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

    // Reduce to original width if needed.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Clear the upper bits of the subvector and move it to its insert position.
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

  // Isolate the bits below the insertion point.
  unsigned LowShift = NumElems - IdxVal;
  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
                    DAG.getTargetConstant(LowShift, dl, MVT::i8));

  // Isolate the bits after the last inserted bit.
  unsigned HighShift = IdxVal + SubVecNumElems;
  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
                     DAG.getTargetConstant(HighShift, dl, MVT::i8));

  // Now OR all 3 pieces together.
  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);

  // Reduce to original width if needed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}

/// Concatenate two subvectors of identical type into a vector with twice the
/// element count, with \p V1 in the low half and \p V2 in the high half.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
                                const SDLoc &dl) {
  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
  EVT SubVT = V1.getValueType();
  EVT SubSVT = SubVT.getScalarType();
  unsigned SubNumElts = SubVT.getVectorNumElements();
  unsigned SubVectorWidth = SubVT.getSizeInBits();
  EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
  SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
  return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
}

/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); unsigned NumElts = VT.getSizeInBits() / 32; SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || ISD::ZERO_EXTEND == Opcode) && "Unknown extension opcode"); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. if (InVT.getSizeInBits() > 128) { assert(VT.getSizeInBits() == InVT.getSizeInBits() && "Expected VTs to be the same size!"); unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, std::max(128U, (unsigned)VT.getSizeInBits() / Scale)); InVT = In.getValueType(); } if (VT.getVectorNumElements() != InVT.getVectorNumElements()) Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, In); } // Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG) { LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask); RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS); return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); } void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl &Mask, bool Lo, bool Unary) { assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 && "Illegal vector type to unpack"); assert(Mask.empty() && "Expected an empty shuffle mask vector"); int NumElts = VT.getVectorNumElements(); int NumEltsInLane = 128 / VT.getScalarSizeInBits(); for (int i = 
0; i < NumElts; ++i) { unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; int Pos = (i % NumEltsInLane) / 2 + LaneStart; Pos += (Unary ? 0 : NumElts * (i % 2)); Pos += (Lo ? 0 : NumEltsInLane / 2); Mask.push_back(Pos); } } /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation /// imposed by AVX and specific to the unary pattern. Example: /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); int NumElts = VT.getVectorNumElements(); for (int i = 0; i < NumElts; ++i) { int Pos = i / 2; Pos += (Lo ? 0 : NumElts / 2); Mask.push_back(Pos); } } // Attempt to constant fold, else just create a VECTOR_SHUFFLE. static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef Mask) { if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) && (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) { SmallVector Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType())); for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) { int M = Mask[I]; if (M < 0) continue; SDValue V = (M < NumElts) ? V1 : V2; if (V.isUndef()) continue; Ops[I] = V.getOperand(M % NumElts); } return DAG.getBuildVector(VT, dl, Ops); } return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. 
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); } /// Returns a node that packs the LHS + RHS nodes together at half width. /// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half. /// TODO: Add subvector splitting if/when we have a need for it. static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf = false) { MVT OpVT = LHS.getSimpleValueType(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8; assert(OpVT == RHS.getSimpleValueType() && VT.getSizeInBits() == OpVT.getSizeInBits() && (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() && "Unexpected PACK operand types"); assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) && "Unexpected PACK result type"); // Rely on vector shuffles for vXi64 -> vXi32 packing. if (EltSizeInBits == 32) { SmallVector PackMask; int Offset = PackHiHalf ? 1 : 0; int NumElts = VT.getVectorNumElements(); for (int I = 0; I != NumElts; I += 4) { PackMask.push_back(I + Offset); PackMask.push_back(I + Offset + 2); PackMask.push_back(I + Offset + NumElts); PackMask.push_back(I + Offset + NumElts + 2); } return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS), DAG.getBitcast(VT, RHS), PackMask); } // See if we already have sufficient leading bits for PACKSS/PACKUS. 
if (!PackHiHalf) { if (UsePackUS && DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits && DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits) return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS); if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits && DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits) return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS); } // Fallback to sign/zero extending the requested half and pack. SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8); if (UsePackUS) { if (PackHiHalf) { LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt); RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt); } else { SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT); LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask); RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask); }; return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS); }; if (!PackHiHalf) { LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt); RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt); } LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt); RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt); return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); int NumElems = VT.getVectorNumElements(); SmallVector MaskVec(NumElems); for (int i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec[i] = (i == Idx) ? 
NumElems : i; return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) { if (Ptr.getOpcode() == X86ISD::Wrapper || Ptr.getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr.getOperand(0); return dyn_cast(Ptr); } // TODO: Add support for non-zero offsets. static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) { ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) return nullptr; return CNode->getConstVal(); } static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { if (!Load || !ISD::isNormalLoad(Load)) return nullptr; return getTargetConstantFromBasePtr(Load->getBasePtr()); } static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); return getTargetConstantFromNode(dyn_cast(Op)); } const Constant * X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { assert(LD && "Unexpected null LoadSDNode"); return getTargetConstantFromNode(LD); } // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl &EltBits, bool AllowWholeUndefs = true, bool AllowPartialUndefs = false) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; // Bitcast a source array of element bits to the target size. auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"); // Don't split if we don't allow undef bits. 
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; if (UndefSrcElts.getBoolValue() && !AllowUndefs) return false; // If we're already the right size, don't bother bitcasting. if (NumSrcElts == NumElts) { UndefElts = UndefSrcElts; EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); return true; } // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); for (unsigned i = 0; i != NumSrcElts; ++i) { unsigned BitOffset = i * SrcEltSizeInBits; if (UndefSrcElts[i]) UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); MaskBits.insertBits(SrcEltBits[i], BitOffset); } // Split the undef/constant single bitset data into the target elements. UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { unsigned BitOffset = i * EltSizeInBits; APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnes()) { if (!AllowWholeUndefs) return false; UndefElts.setBit(i); continue; } // If only some bits are UNDEF then treat them as zero (or bail if not // supported). if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); } return true; }; // Collect constant bits and insert into mask/undef bit masks. 
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, unsigned UndefBitIndex) { if (!Cst) return false; if (isa(Cst)) { Undefs.setBit(UndefBitIndex); return true; } if (auto *CInt = dyn_cast(Cst)) { Mask = CInt->getValue(); return true; } if (auto *CFP = dyn_cast(Cst)) { Mask = CFP->getValueAPF().bitcastToAPInt(); return true; } if (auto *CDS = dyn_cast(Cst)) { Type *Ty = CDS->getType(); Mask = APInt::getZero(Ty->getPrimitiveSizeInBits()); Type *EltTy = CDS->getElementType(); bool IsInteger = EltTy->isIntegerTy(); bool IsFP = EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy(); if (!IsInteger && !IsFP) return false; unsigned EltBits = EltTy->getPrimitiveSizeInBits(); for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) if (IsInteger) Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits); else Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(), I * EltBits); return true; } return false; }; // Handle UNDEFs. if (Op.isUndef()) { APInt UndefSrcElts = APInt::getAllOnes(NumElts); SmallVector SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract scalar constant bits. if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getZero(1); SmallVector SrcEltBits(1, Cst->getAPIntValue()); return CastBitData(UndefSrcElts, SrcEltBits); } if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getZero(1); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SmallVector SrcEltBits(1, RawBits); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from build vector. 
if (auto *BV = dyn_cast(Op)) { BitVector Undefs; SmallVector SrcEltBits; unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) { APInt UndefSrcElts = APInt::getZero(SrcEltBits.size()); for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I) if (Undefs[I]) UndefSrcElts.setBit(I); return CastBitData(UndefSrcElts, SrcEltBits); } } // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0) return false; unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; if ((SizeInBits % SrcEltSizeInBits) != 0) return false; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0; i != NumSrcElts; ++i) if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], UndefSrcElts, i)) return false; return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a broadcasted constant pool scalar. 
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && EltSizeInBits <= VT.getScalarSizeInBits()) { auto *MemIntr = cast(Op); if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits()) return false; SDValue Ptr = MemIntr->getBasePtr(); if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { if (UndefSrcElts[0]) UndefSrcElts.setBits(0, NumSrcElts); if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits) SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits); SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); return CastBitData(UndefSrcElts, SrcEltBits); } } } // Extract constant bits from a subvector broadcast. if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { auto *MemIntr = cast(Op); SDValue Ptr = MemIntr->getBasePtr(); // The source constant may be larger than the subvector broadcast, // ensure we extract the correct subvector constants. 
if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) { Type *CstTy = Cst->getType(); unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits(); if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 || (SizeInBits % SubVecSizeInBits) != 0) return false; unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits; unsigned NumSubVecs = SizeInBits / SubVecSizeInBits; APInt UndefSubElts(NumSubElts, 0); SmallVector SubEltBits(NumSubElts * NumSubVecs, APInt(CstEltSizeInBits, 0)); for (unsigned i = 0; i != NumSubElts; ++i) { if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i], UndefSubElts, i)) return false; for (unsigned j = 1; j != NumSubVecs; ++j) SubEltBits[i + (j * NumSubElts)] = SubEltBits[i]; } UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(), UndefSubElts); return CastBitData(UndefSubElts, SubEltBits); } } // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && isa(Op.getOperand(0).getOperand(0))) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits; const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0); SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits)); SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Insert constant bits from a base and sub vector sources. if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) { // If bitcasts to larger elements we might lose track of undefs - don't // allow any to be safe. 
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits; APInt UndefSrcElts, UndefSubElts; SmallVector EltSrcBits, EltSubBits; if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits, UndefSubElts, EltSubBits, AllowWholeUndefs && AllowUndefs, AllowPartialUndefs && AllowUndefs) && getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits, UndefSrcElts, EltSrcBits, AllowWholeUndefs && AllowUndefs, AllowPartialUndefs && AllowUndefs)) { unsigned BaseIdx = Op.getConstantOperandVal(2); UndefSrcElts.insertBits(UndefSubElts, BaseIdx); for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) EltSrcBits[BaseIdx + i] = EltSubBits[i]; return CastBitData(UndefSrcElts, EltSrcBits); } } // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { // TODO - support extract_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, EltBits, AllowWholeUndefs, AllowPartialUndefs)) { EVT SrcVT = Op.getOperand(0).getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = VT.getVectorNumElements(); unsigned BaseIdx = Op.getConstantOperandVal(1); UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx); if ((BaseIdx + NumSubElts) != NumSrcElts) EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end()); if (BaseIdx != 0) EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx); return true; } } // Extract constant bits from shuffle node sources. if (auto *SVN = dyn_cast(Op)) { // TODO - support shuffle through bitcasts. 
if (EltSizeInBits != VT.getScalarSizeInBits()) return false; ArrayRef Mask = SVN->getMask(); if ((!AllowWholeUndefs || !AllowPartialUndefs) && llvm::any_of(Mask, [](int M) { return M < 0; })) return false; APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if (isAnyInRange(Mask, 0, NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts0, EltBits0, AllowWholeUndefs, AllowPartialUndefs)) return false; if (isAnyInRange(Mask, NumElts, 2 * NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, UndefElts1, EltBits1, AllowWholeUndefs, AllowPartialUndefs)) return false; UndefElts = APInt::getZero(NumElts); for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; if (M < 0) { UndefElts.setBit(i); EltBits.push_back(APInt::getZero(EltSizeInBits)); } else if (M < (int)NumElts) { if (UndefElts0[M]) UndefElts.setBit(i); EltBits.push_back(EltBits0[M]); } else { if (UndefElts1[M - NumElts]) UndefElts.setBit(i); EltBits.push_back(EltBits1[M - NumElts]); } } return true; } return false; } namespace llvm { namespace X86 { bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode( Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits, /*AllowWholeUndefs*/ true, AllowPartialUndefs)) { int SplatIndex = -1; for (int i = 0, e = EltBits.size(); i != e; ++i) { if (UndefElts[i]) continue; if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { SplatIndex = -1; break; } SplatIndex = i; } if (0 <= SplatIndex) { SplatVal = EltBits[SplatIndex]; return true; } } return false; } } // namespace X86 } // namespace llvm static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl &RawMask, APInt &UndefElts) { // Extract the raw target constant bits. 
SmallVector EltBits; if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) return false; // Insert the extracted elements into the mask. for (const APInt &Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; } // Match not(xor X, -1) -> X. // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1). // Match not(extract_subvector(xor X, -1)) -> extract_subvector(X). // Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y). static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { V = peekThroughBitcasts(V); if (V.getOpcode() == ISD::XOR && (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) || isAllOnesConstant(V.getOperand(1)))) return V.getOperand(0); if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), Not, V.getOperand(1)); } } if (V.getOpcode() == X86ISD::PCMPGT && !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) && !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) && V.getOperand(0).hasOneUse()) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(V.getOperand(0), V.getScalarValueSizeInBits(), UndefElts, EltBits)) { // Don't fold min_signed_value -> (min_signed_value - 1) bool MinSigned = false; for (APInt &Elt : EltBits) { MinSigned |= Elt.isMinSignedValue(); Elt -= 1; } if (!MinSigned) { SDLoc DL(V); MVT VT = V.getSimpleValueType(); return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1), getConstVector(EltBits, UndefElts, VT, DAG, DL)); } } } SmallVector CatOps; if (collectConcatOps(V.getNode(), CatOps, DAG)) { for (SDValue &CatOp : CatOps) { SDValue NotCat = IsNOT(CatOp, DAG); if (!NotCat) return SDValue(); CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); } return 
DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); } return SDValue(); } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. /// A multi-stage pack shuffle mask is created by specifying NumStages > 1. /// Note: This ignores saturation, so inputs must be checked first. static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Unary, unsigned NumStages = 1) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 0 : NumElts; unsigned Repetitions = 1u << (NumStages - 1); unsigned Increment = 1u << NumStages; assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction"); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Stage = 0; Stage != Repetitions; ++Stage) { for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) Mask.push_back(Elt + (Lane * NumEltsPerLane)); for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); } } } // Split the demanded elts of a PACKSS/PACKUS node between its operands. static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { int NumLanes = VT.getSizeInBits() / 128; int NumElts = DemandedElts.getBitWidth(); int NumInnerElts = NumElts / 2; int NumEltsPerLane = NumElts / NumLanes; int NumInnerEltsPerLane = NumInnerElts / NumLanes; DemandedLHS = APInt::getZero(NumInnerElts); DemandedRHS = APInt::getZero(NumInnerElts); // Map DemandedElts to the packed operands. 
for (int Lane = 0; Lane != NumLanes; ++Lane) { for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { int OuterIdx = (Lane * NumEltsPerLane) + Elt; int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; if (DemandedElts[OuterIdx]) DemandedLHS.setBit(InnerIdx); if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) DemandedRHS.setBit(InnerIdx); } } } // Split the demanded elts of a HADD/HSUB node between its operands. static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts, DemandedLHS, DemandedRHS); DemandedLHS |= DemandedLHS << 1; DemandedRHS |= DemandedRHS << 1; } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. /// Sets \p IsUnary to true if only one source is used. Note that this will set /// IsUnary for shuffles which use a single input multiple times, and in those /// cases it will adjust the mask to only have indices within that single input. /// It is an error to call this with non-empty Mask/Ops vectors. 
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); unsigned NumElems = VT.getVectorNumElements(); unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector RawMask; APInt RawUndefs; uint64_t ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); IsUnary = false; bool IsFakeUnary = false; switch (N.getOpcode()) { case X86ISD::BLENDI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeBLENDMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::SHUFP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::INSERTPS: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeINSERTPSMask(ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::EXTRQI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); if (isa(N.getOperand(1)) && isa(N.getOperand(2))) { int BitLen = N.getConstantOperandVal(1); int BitIdx = N.getConstantOperandVal(2); DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = true; } break; case X86ISD::INSERTQI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); 
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); if (isa(N.getOperand(2)) && isa(N.getOperand(3))) { int BitLen = N.getConstantOperandVal(2); int BitIdx = N.getConstantOperandVal(3); DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); } break; case X86ISD::UNPCKH: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKHMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::UNPCKL: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKLMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::MOVHLPS: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::MOVLHPS: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::VALIGN: assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeVALIGNMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); Ops.push_back(N.getOperand(1)); Ops.push_back(N.getOperand(0)); break; case X86ISD::PALIGNR: 
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePALIGNRMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); Ops.push_back(N.getOperand(1)); Ops.push_back(N.getOperand(0)); break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSLLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSRLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSHUFHWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSHUFLWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeZeroMoveLowMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VBROADCAST: // We only decode broadcasts of same-sized vectors, peeking through to // extracted subvectors is likely to cause hasOneUse issues with // 
SimplifyDemandedBits etc. if (N.getOperand(0).getValueType() == VT) { DecodeVectorBroadcast(NumElems, Mask); IsUnary = true; break; } return false; case X86ISD::VPERMILPV: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N.getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::PSHUFB: { assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N.getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodePSHUFBMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeVPERMMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::MOVSH: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeVPERM2X128Mask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::SHUF128: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, 
Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::MOVSLDUP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSLDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSHDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVDDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VPERMIL2: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); SDValue MaskNode = N.getOperand(2); SDValue CtrlNode = N.getOperand(3); if (ConstantSDNode *CtrlOp = dyn_cast(CtrlNode)) { unsigned CtrlImm = CtrlOp->getZExtValue(); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs, Mask); break; } } return false; } case X86ISD::VPPERM: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); SDValue MaskNode = N.getOperand(2); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodeVPPERMMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV: { assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. 
Ops.push_back(N.getOperand(1)); SDValue MaskNode = N.getOperand(0); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMVMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV3: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2); // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(2)); SDValue MaskNode = N.getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMV3Mask(RawMask, RawUndefs, Mask); break; } return false; } default: llvm_unreachable("unknown target shuffle node"); } // Empty mask indicates the decode failed. if (Mask.empty()) return false; // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero && isAnyZero(Mask)) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); // If we didn't already add operands in the opcode-specific code, default to // adding 1 or 2 operands starting at 0. if (Ops.empty()) { Ops.push_back(N.getOperand(0)); if (!IsUnary || IsFakeUnary) Ops.push_back(N.getOperand(1)); } return true; } // Wrapper for getTargetShuffleMask with InUnary; static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask) { bool IsUnary; return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary); } /// Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. 
/// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static void computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero) { int Size = Mask.size(); KnownUndef = KnownZero = APInt::getZero(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); int VectorSizeInBits = V1.getValueSizeInBits(); int ScalarSizeInBits = VectorSizeInBits / Size; assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0) { KnownUndef.setBit(i); continue; } if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { KnownZero.setBit(i); continue; } // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; M %= Size; // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. if (V.getOpcode() != ISD::BUILD_VECTOR) continue; // If the BUILD_VECTOR has fewer elements then the bitcasted portion of // the (larger) source element must be UNDEF/ZERO. 
if ((Size % V.getNumOperands()) == 0) { int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef()) KnownUndef.setBit(i); if (X86::isZeroNode(Op)) KnownZero.setBit(i); else if (ConstantSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getAPIntValue(); Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); if (Val == 0) KnownZero.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); if (Val == 0) KnownZero.setBit(i); } continue; } // If the BUILD_VECTOR has more elements then all the (smaller) source // elements must be UNDEF or ZERO. if ((V.getNumOperands() % Size) == 0) { int Scale = V->getNumOperands() / Size; bool AllUndef = true; bool AllZero = true; for (int j = 0; j < Scale; ++j) { SDValue Op = V.getOperand((M * Scale) + j); AllUndef &= Op.isUndef(); AllZero &= X86::isZeroNode(Op); } if (AllUndef) KnownUndef.setBit(i); if (AllZero) KnownZero.setBit(i); continue; } } } /// Decode a target shuffle mask and inputs and see if any values are /// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. /// FIXME: Merge this with computeZeroableShuffleElements? static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops, APInt &KnownUndef, APInt &KnownZero) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary)) return false; int Size = Mask.size(); SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; KnownUndef = KnownZero = APInt::getZero(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Size) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Size; // Extract known constant input data. 
APInt UndefSrcElts[2]; SmallVector SrcEltBits[2]; bool IsSrcConstant[2] = { getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ false), getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ false)}; for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. if (M < 0) { assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!"); if (SM_SentinelUndef == M) KnownUndef.setBit(i); if (SM_SentinelZero == M) KnownZero.setBit(i); continue; } // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; // We are referencing an UNDEF input. if (V.isUndef()) { KnownUndef.setBit(i); continue; } // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. // TODO: We currently only set UNDEF for integer types - floats use the same // registers as vectors and many of the scalar folded loads rely on the // SCALAR_TO_VECTOR pattern. if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && (Size % V.getValueType().getVectorNumElements()) == 0) { int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) KnownUndef.setBit(i); else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) KnownZero.setBit(i); continue; } // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF // base vectors. if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Vec = V.getOperand(0); int NumVecElts = Vec.getValueType().getVectorNumElements(); if (Vec.isUndef() && Size == NumVecElts) { int Idx = V.getConstantOperandVal(2); int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements(); if (M < Idx || (Idx + NumSubElts) <= M) KnownUndef.setBit(i); } continue; } // Attempt to extract from the source's constant bits. 
if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) KnownUndef.setBit(i); else if (SrcEltBits[SrcIdx][M] == 0) KnownZero.setBit(i); } } assert(VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"); return true; } // Replace target shuffle mask elements with known undef/zero sentinels. static void resolveTargetShuffleFromZeroables(SmallVectorImpl &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros= true) { unsigned NumElts = Mask.size(); assert(KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); for (unsigned i = 0; i != NumElts; ++i) { if (KnownUndef[i]) Mask[i] = SM_SentinelUndef; else if (ResolveKnownZeros && KnownZero[i]) Mask[i] = SM_SentinelZero; } } // Extract target shuffle mask sentinel elements to known undef/zero bitmasks. static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl &Mask, APInt &KnownUndef, APInt &KnownZero) { unsigned NumElts = Mask.size(); KnownUndef = KnownZero = APInt::getZero(NumElts); for (unsigned i = 0; i != NumElts; ++i) { int M = Mask[i]; if (SM_SentinelUndef == M) KnownUndef.setBit(i); if (SM_SentinelZero == M) KnownZero.setBit(i); } } // Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask. static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, SDValue Cond, bool IsBLENDV = false) { EVT CondVT = Cond.getValueType(); unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); unsigned NumElts = CondVT.getVectorNumElements(); APInt UndefElts; SmallVector EltBits; if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ false)) return false; Mask.resize(NumElts, SM_SentinelUndef); for (int i = 0; i != (int)NumElts; ++i) { Mask[i] = i; // Arbitrarily choose from the 2nd operand if the select condition element // is undef. // TODO: Can we do better by matching patterns such as even/odd? 
if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) || (IsBLENDV && EltBits[i].isNonNegative())) Mask[i] += NumElts; } return true; } // Forward declaration (for getFauxShuffleMask recursive check). static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. // TODO: Merge into getTargetShuffleInputs() static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl &Mask, SmallVectorImpl &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { Mask.clear(); Ops.clear(); MVT VT = N.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) return false; assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); unsigned NumSizeInBytes = NumSizeInBits / 8; unsigned NumBytesPerElt = NumBitsPerElt / 8; unsigned Opcode = N.getOpcode(); switch (Opcode) { case ISD::VECTOR_SHUFFLE: { // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here. ArrayRef ShuffleMask = cast(N)->getMask(); if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) { Mask.append(ShuffleMask.begin(), ShuffleMask.end()); Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(1)); return true; } return false; } case ISD::AND: case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. APInt UndefElts; SmallVector EltBits; SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); bool IsAndN = (X86ISD::ANDNP == Opcode); uint64_t ZeroMask = IsAndN ? 255 : 0; if (!getTargetConstantBitsFromNode(IsAndN ? 
N0 : N1, 8, UndefElts, EltBits, /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) return false; // We can't assume an undef src element gives an undef dst - the other src // might be zero. assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask"); for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { const APInt &ByteBits = EltBits[i]; if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); } Ops.push_back(IsAndN ? N1 : N0); return true; } case ISD::OR: { // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. SDValue N0 = peekThroughBitcasts(N.getOperand(0)); SDValue N1 = peekThroughBitcasts(N.getOperand(1)); if (!N0.getValueType().isVector() || !N1.getValueType().isVector()) return false; SmallVector SrcMask0, SrcMask1; SmallVector SrcInputs0, SrcInputs1; APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements()); APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements()); if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG, Depth + 1, true) || !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG, Depth + 1, true)) return false; size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0); narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1); for (int i = 0; i != (int)MaskSize; ++i) { // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite // loops converting between OR and BLEND shuffles due to // canWidenShuffleElements merging away undef elements, meaning we // fail to recognise the OR as the undef element isn't known zero. 
if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) Mask.push_back(SM_SentinelZero); else if (Mask1[i] == SM_SentinelZero) Mask.push_back(i); else if (Mask0[i] == SM_SentinelZero) Mask.push_back(i + MaskSize); else return false; } Ops.push_back(N0); Ops.push_back(N1); return true; } case ISD::INSERT_SUBVECTOR: { SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); if (!N->isOnlyUserOf(Sub.getNode())) return false; SDValue SubBC = peekThroughBitcasts(Sub); uint64_t InsertIdx = N.getConstantOperandVal(2); // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR && SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) { uint64_t ExtractIdx = SubBC.getConstantOperandVal(1); SDValue SubBCSrc = SubBC.getOperand(0); unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements(); unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts); assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 && "Subvector valuetype mismatch"); InsertIdx *= (MaxElts / NumElts); ExtractIdx *= (MaxElts / NumSubSrcBCElts); NumSubElts *= (MaxElts / NumElts); bool SrcIsUndef = Src.isUndef(); for (int i = 0; i != (int)MaxElts; ++i) Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i); for (int i = 0; i != (int)NumSubElts; ++i) Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i; if (!SrcIsUndef) Ops.push_back(Src); Ops.push_back(SubBCSrc); return true; } // Handle CONCAT(SUB0, SUB1). // Limit this to vXi64 512-bit vector cases to make the most of AVX512 // cross lane shuffles. 
if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) && NumBitsPerElt == 64 && NumSizeInBits == 512 && Src.getOpcode() == ISD::INSERT_SUBVECTOR && Src.getOperand(0).isUndef() && Src.getOperand(1).getValueType() == SubVT && Src.getConstantOperandVal(2) == 0) { for (int i = 0; i != (int)NumSubElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) Mask.push_back(i + NumElts); Ops.push_back(Src.getOperand(1)); Ops.push_back(Sub); return true; } // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). SmallVector SubMask; SmallVector SubInputs; SDValue SubSrc = peekThroughOneUseBitcasts(Sub); EVT SubSrcVT = SubSrc.getValueType(); if (!SubSrcVT.isVector()) return false; APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements()); if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG, Depth + 1, ResolveKnownElts)) return false; // Subvector shuffle inputs must not be larger than the subvector. if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) { return SubVT.getFixedSizeInBits() < SubInput.getValueSizeInBits().getFixedValue(); })) return false; if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); if ((NumSubElts % SubMask.size()) == 0) { int Scale = NumSubElts / SubMask.size(); SmallVector ScaledSubMask; narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask); SubMask = ScaledSubMask; } else { int Scale = SubMask.size() / NumSubElts; NumSubElts = SubMask.size(); NumElts *= Scale; InsertIdx *= Scale; } } Ops.push_back(Src); Ops.append(SubInputs.begin(), SubInputs.end()); if (ISD::isBuildVectorAllZeros(Src.getNode())) Mask.append(NumElts, SM_SentinelZero); else for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { int InputIdx = M / NumSubElts; M = (NumElts * (1 + InputIdx)) + (M % NumSubElts); } Mask[i + InsertIdx] = M; } return true; } case 
X86ISD::PINSRB: case X86ISD::PINSRW: case ISD::SCALAR_TO_VECTOR: case ISD::INSERT_VECTOR_ELT: { // Match against a insert_vector_elt/scalar_to_vector of an extract from a // vector, for matching src/dst vector types. SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1); unsigned DstIdx = 0; if (Opcode != ISD::SCALAR_TO_VECTOR) { // Check we have an in-range constant insertion index. if (!isa(N.getOperand(2)) || N.getConstantOperandAPInt(2).uge(NumElts)) return false; DstIdx = N.getConstantOperandVal(2); // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern. if (X86::isZeroNode(Scl)) { Ops.push_back(N.getOperand(0)); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i); return true; } } // Peek through trunc/aext/zext/bitcast. // TODO: aext shouldn't require SM_SentinelZero padding. // TODO: handle shift of scalars. unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits(); while (Scl.getOpcode() == ISD::TRUNCATE || Scl.getOpcode() == ISD::ANY_EXTEND || Scl.getOpcode() == ISD::ZERO_EXTEND || (Scl.getOpcode() == ISD::BITCAST && Scl.getScalarValueSizeInBits() == Scl.getOperand(0).getScalarValueSizeInBits())) { Scl = Scl.getOperand(0); MinBitsPerElt = std::min(MinBitsPerElt, Scl.getScalarValueSizeInBits()); } if ((MinBitsPerElt % 8) != 0) return false; // Attempt to find the source vector the scalar was extracted from. 
SDValue SrcExtract; if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT || Scl.getOpcode() == X86ISD::PEXTRW || Scl.getOpcode() == X86ISD::PEXTRB) && Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) { SrcExtract = Scl; } if (!SrcExtract || !isa(SrcExtract.getOperand(1))) return false; SDValue SrcVec = SrcExtract.getOperand(0); EVT SrcVT = SrcVec.getValueType(); if (!SrcVT.getScalarType().isByteSized()) return false; unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8); unsigned DstByte = DstIdx * NumBytesPerElt; MinBitsPerElt = std::min(MinBitsPerElt, SrcVT.getScalarSizeInBits()); // Create 'identity' byte level shuffle mask and then add inserted bytes. if (Opcode == ISD::SCALAR_TO_VECTOR) { Ops.push_back(SrcVec); Mask.append(NumSizeInBytes, SM_SentinelUndef); } else { Ops.push_back(SrcVec); Ops.push_back(N.getOperand(0)); for (int i = 0; i != (int)NumSizeInBytes; ++i) Mask.push_back(NumSizeInBytes + i); } unsigned MinBytesPerElts = MinBitsPerElt / 8; MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt); for (unsigned i = 0; i != MinBytesPerElts; ++i) Mask[DstByte + i] = SrcByte + i; for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i) Mask[DstByte + i] = SM_SentinelZero; return true; } case X86ISD::PACKSS: case X86ISD::PACKUS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); APInt EltsLHS, EltsRHS; getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS); // If we know input saturation won't happen (or we don't care for particular // lanes), we can treat this as a truncation shuffle. 
bool Offset0 = false, Offset1 = false; if (Opcode == X86ISD::PACKSS) { if ((!(N0.isUndef() || EltsLHS.isZero()) && DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || (!(N1.isUndef() || EltsRHS.isZero()) && DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; // We can't easily fold ASHR into a shuffle, but if it was feeding a // PACKSS then it was likely being used for sign-extension for a // truncation, so just peek through and adjust the mask accordingly. if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) && N0.getConstantOperandAPInt(1) == NumBitsPerElt) { Offset0 = true; N0 = N0.getOperand(0); } if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) && N1.getConstantOperandAPInt(1) == NumBitsPerElt) { Offset1 = true; N1 = N1.getOperand(0); } } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); if ((!(N0.isUndef() || EltsLHS.isZero()) && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || (!(N1.isUndef() || EltsRHS.isZero()) && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } bool IsUnary = (N0 == N1); Ops.push_back(N0); if (!IsUnary) Ops.push_back(N1); createPackShuffleMask(VT, Mask, IsUnary); if (Offset0 || Offset1) { for (int &M : Mask) if ((Offset0 && isInRange(M, 0, NumElts)) || (Offset1 && isInRange(M, NumElts, 2 * NumElts))) ++M; } return true; } case ISD::VSELECT: case X86ISD::BLENDV: { SDValue Cond = N.getOperand(0); if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) { Ops.push_back(N.getOperand(1)); Ops.push_back(N.getOperand(2)); return true; } return false; } case X86ISD::VTRUNC: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); // Truncated source must be a simple vector. 
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || (SrcVT.getScalarSizeInBits() % 8) != 0) return false; unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits(); unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt; assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation"); for (unsigned i = 0; i != NumSrcElts; ++i) Mask.push_back(i * Scale); Mask.append(NumElts - NumSrcElts, SM_SentinelZero); Ops.push_back(Src); return true; } case X86ISD::VSHLI: case X86ISD::VSRLI: { uint64_t ShiftVal = N.getConstantOperandVal(1); // Out of range bit shifts are guaranteed to be zero. if (NumBitsPerElt <= ShiftVal) { Mask.append(NumElts, SM_SentinelZero); return true; } // We can only decode 'whole byte' bit shifts as shuffles. if ((ShiftVal % 8) != 0) break; uint64_t ByteShift = ShiftVal / 8; Ops.push_back(N.getOperand(0)); // Clear mask to all zeros and insert the shifted byte indices. Mask.append(NumSizeInBytes, SM_SentinelZero); if (X86ISD::VSHLI == Opcode) { for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j] = i + j - ByteShift; } else { for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j - ByteShift] = i + j; } return true; } case X86ISD::VROTLI: case X86ISD::VROTRI: { // We can only decode 'whole byte' bit rotates as shuffles. uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt); if ((RotateVal % 8) != 0) return false; Ops.push_back(N.getOperand(0)); int Offset = RotateVal / 8; Offset = (X86ISD::VROTLI == Opcode ? 
NumBytesPerElt - Offset : Offset); for (int i = 0; i != (int)NumElts; ++i) { int BaseIdx = i * NumBytesPerElt; for (int j = 0; j != (int)NumBytesPerElt; ++j) { Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt)); } } return true; } case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); if (!Src.getSimpleValueType().isVector()) { if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isNullConstant(Src.getOperand(1)) || Src.getOperand(0).getValueType().getScalarType() != VT.getScalarType()) return false; Src = Src.getOperand(0); } Ops.push_back(Src); Mask.append(NumElts, 0); return true; } case ISD::SIGN_EXTEND_VECTOR_INREG: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits(); // Extended source must be a simple vector. if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || (NumBitsPerSrcElt % 8) != 0) return false; // We can only handle all-signbits extensions. APInt DemandedSrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt) return false; assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension"); unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt; for (unsigned I = 0; I != NumElts; ++I) Mask.append(Scale, I); Ops.push_back(Src); return true; } case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::ANY_EXTEND_VECTOR_INREG: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); // Extended source must be a simple vector. 
if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || (SrcVT.getScalarSizeInBits() % 8) != 0) return false; bool IsAnyExtend = (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, IsAnyExtend, Mask); Ops.push_back(Src); return true; } } return false; } /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, SmallVectorImpl &Mask) { int MaskWidth = Mask.size(); SmallVector UsedInputs; for (int i = 0, e = Inputs.size(); i < e; ++i) { int lo = UsedInputs.size() * MaskWidth; int hi = lo + MaskWidth; // Strip UNDEF input usage. if (Inputs[i].isUndef()) for (int &M : Mask) if ((lo <= M) && (M < hi)) M = SM_SentinelUndef; // Check for unused inputs. if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { for (int &M : Mask) if (lo <= M) M -= MaskWidth; continue; } // Check for repeated inputs. bool IsRepeat = false; for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) { if (UsedInputs[j] != Inputs[i]) continue; for (int &M : Mask) if (lo <= M) M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth); IsRepeat = true; break; } if (IsRepeat) continue; UsedInputs.push_back(Inputs[i]); } Inputs = UsedInputs; } /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs /// and then sets the SM_SentinelUndef and SM_SentinelZero values. /// Returns true if the target shuffle mask was decoded. static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, APInt &KnownUndef, APInt &KnownZero, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { if (Depth >= SelectionDAG::MaxRecursionDepth) return false; // Limit search depth. 
EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) return false; if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { if (ResolveKnownElts) resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero); return true; } if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, ResolveKnownElts)) { resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); return true; } return false; } static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { APInt KnownUndef, KnownZero; return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef, KnownZero, DAG, Depth, ResolveKnownElts); } static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, const SelectionDAG &DAG, unsigned Depth = 0, bool ResolveKnownElts = true) { EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) return false; unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt DemandedElts = APInt::getAllOnes(NumElts); return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth, ResolveKnownElts); } // Attempt to create a scalar/subvector broadcast from the base MemSDNode. static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG) { assert((Opcode == X86ISD::VBROADCAST_LOAD || Opcode == X86ISD::SUBV_BROADCAST_LOAD) && "Unknown broadcast load type"); // Ensure this is a simple (non-atomic, non-voltile), temporal read memop. 
if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal()) return SDValue(); SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::getFixed(Offset), DL); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {Mem->getChain(), Ptr}; SDValue BcstLd = DAG.getMemIntrinsicNode( Opcode, DL, Tys, Ops, MemVT, DAG.getMachineFunction().getMachineMemOperand( Mem->getMemOperand(), Offset, MemVT.getStoreSize())); DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1)); return BcstLd; } /// Returns the scalar element that will make up the i'th /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); // Limit search depth. EVT VT = Op.getValueType(); unsigned Opcode = Op.getOpcode(); unsigned NumElems = VT.getVectorNumElements(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (auto *SV = dyn_cast(Op)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = VT.getSimpleVT(); MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; SmallVector ShuffleOps; if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelZero) return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT) : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT); if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range"); SDValue Src = (Elt < NumElems) ? 
ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); } // Recurse into insert_subvector base/sub vector to find scalars. if (Opcode == ISD::INSERT_SUBVECTOR) { SDValue Vec = Op.getOperand(0); SDValue Sub = Op.getOperand(1); uint64_t SubIdx = Op.getConstantOperandVal(2); unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1); return getShuffleScalarElt(Vec, Index, DAG, Depth + 1); } // Recurse into concat_vectors sub vector to find scalars. if (Opcode == ISD::CONCAT_VECTORS) { EVT SubVT = Op.getOperand(0).getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); uint64_t SubIdx = Index / NumSubElts; uint64_t SubElt = Index % NumSubElts; return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1); } // Recurse into extract_subvector src vector to find scalars. if (Opcode == ISD::EXTRACT_SUBVECTOR) { SDValue Src = Op.getOperand(0); uint64_t SrcIdx = Op.getConstantOperandVal(1); return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1); } // We only peek through bitcasts of the same vector width. if (Opcode == ISD::BITCAST) { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems) return getShuffleScalarElt(Src, Index, DAG, Depth + 1); return SDValue(); } // Actual nodes that may contain scalar elements // For insert_vector_elt - either return the index matching scalar or recurse // into the base vector. if (Opcode == ISD::INSERT_VECTOR_ELT && isa(Op.getOperand(2))) { if (Op.getConstantOperandAPInt(2) == Index) return Op.getOperand(1); return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1); } if (Opcode == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? 
Op.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (Opcode == ISD::BUILD_VECTOR) return Op.getOperand(Index); return SDValue(); } // Use PINSRB/PINSRW/PINSRD to create a build vector. static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && "Illegal vector insertion"); SDValue V; bool First = true; for (unsigned i = 0; i < NumElts; ++i) { bool IsNonZero = NonZeroMask[i]; if (!IsNonZero) continue; // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register // dependency else use SCALAR_TO_VECTOR. if (First) { First = false; if (NumZero || 0 != i) V = getZeroVector(VT, Subtarget, DAG, DL); else { assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i), DAG.getIntPtrConstant(i, DL)); } return V; } /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 8 && !Subtarget.hasSSE41()) return SDValue(); // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget.hasSSE41()) return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget); SDValue V; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. // If both the lowest 16-bits are non-zero, then convert to MOVD. 
if (!NonZeroMask.extractBits(2, 0).isZero() && !NonZeroMask.extractBits(2, 2).isZero()) { for (unsigned I = 0; I != 4; ++I) { if (!NonZeroMask[I]) continue; SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32); if (I != 0) Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, DAG.getConstant(I * 8, DL, MVT::i8)); V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt; } assert(V && "Failed to fold v16i8 vector to zero"); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V); V = DAG.getBitcast(MVT::v8i16, V); } for (unsigned i = V ? 4 : 0; i < 16; i += 2) { bool ThisIsNonZero = NonZeroMask[i]; bool NextIsNonZero = NonZeroMask[i + 1]; if (!ThisIsNonZero && !NextIsNonZero) continue; SDValue Elt; if (ThisIsNonZero) { if (NumZero || NextIsNonZero) Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32); else Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32); } if (NextIsNonZero) { SDValue NextElt = Op.getOperand(i + 1); if (i == 0 && NumZero) NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32); else NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32); NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt, DAG.getConstant(8, DL, MVT::i8)); if (ThisIsNonZero) Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt); else Elt = NextElt; } // If our first insertion is not the first index or zeros are needed, then // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high // elements undefined). if (!V) { if (i != 0 || NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); else { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt); V = DAG.getBitcast(MVT::v8i16, V); continue; } } Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt, DAG.getIntPtrConstant(i / 2, DL)); } return DAG.getBitcast(MVT::v16i8, V); } /// Custom lower build_vector of v8i16. 
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); // Use PINSRW to insert each byte directly. return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget); } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If this is a splat of a pair of elements, use MOVDDUP (unless the target // has XOP; in that case defer lowering to potentially use VPERMIL2PS). // Because we're creating a less complicated build vector here, we may enable // further folding of the MOVDDUP via shuffle transforms. if (Subtarget.hasSSE3() && !Subtarget.hasXOP() && Op.getOperand(0) == Op.getOperand(2) && Op.getOperand(1) == Op.getOperand(3) && Op.getOperand(0) != Op.getOperand(1)) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); // Create a new build vector with the first 2 elements followed by undef // padding, bitcast to v2f64, duplicate, and bitcast back. SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops)); SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV); return DAG.getBitcast(VT, Dup); } // Find all zeroable elements. std::bitset<4> Zeroable, Undefs; for (int i = 0; i < 4; ++i) { SDValue Elt = Op.getOperand(i); Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. 
SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = Elt.getConstantOperandVal(1); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZeroOrUndef = (Zeroable == Undefs) ? DAG.getUNDEF(VT) : getZeroVector(VT, Subtarget, DAG, DL); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. 
if (!Subtarget.hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getBitcast(MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getBitcast(MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL, true)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v16i8; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. 
if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || !LD->isSimple()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. Align RequiredAlign(VT.getSizeInBits() / 8); SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr); if (!InferredAlign || *InferredAlign < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI.setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). 
if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign.value()) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset)); SmallVector Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); } return SDValue(); } // Recurse to find a LoadSDNode source and the accumulated ByteOffest. static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { if (ISD::isNON_EXTLoad(Elt.getNode())) { auto *BaseLd = cast(Elt); if (!BaseLd->isSimple()) return false; Ld = BaseLd; ByteOffset = 0; return true; } switch (Elt.getOpcode()) { case ISD::BITCAST: case ISD::TRUNCATE: case ISD::SCALAR_TO_VECTOR: return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); case ISD::SRL: if (auto *AmtC = dyn_cast(Elt.getOperand(1))) { uint64_t Amt = AmtC->getZExtValue(); if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { ByteOffset += Amt / 8; return true; } } break; case ISD::EXTRACT_VECTOR_ELT: if (auto *IdxC = dyn_cast(Elt.getOperand(1))) { SDValue Src = Elt.getOperand(0); unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && findEltLoadSrc(Src, Ld, ByteOffset)) { uint64_t Idx = IdxC->getZExtValue(); ByteOffset += Idx * (SrcSizeInBits / 8); return true; } } break; } return false; } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or 
/// insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
///
/// The following strategies are attempted in order:
///  1. A single full-width load (optionally shuffled against a zero vector to
///     clear elements that must be zero).
///  2. A half-width load inserted into undef when the upper half of a
///     256/512-bit vector is entirely undef.
///  3. An X86ISD::VZEXT_LOAD of 16 (FP16 only), 32 or 64 bits.
///  4. A broadcast of the smallest repeating load pattern (AVX only).
///
/// \returns the replacement value, or an empty SDValue if nothing matched.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool IsAfterLegalize) {
  // Sub-byte scalars cannot be expressed as byte-addressed loads.
  if ((VT.getScalarSizeInBits() % 8) != 0)
    return SDValue();

  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  APInt LoadMask = APInt::getZero(NumElems);
  APInt ZeroMask = APInt::getZero(NumElems);
  APInt UndefMask = APInt::getZero(NumElems);

  SmallVector<LoadSDNode *, 8> Loads(NumElems, nullptr);
  SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();
    if (Elt.isUndef()) {
      UndefMask.setBit(i);
      continue;
    }
    if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
      ZeroMask.setBit(i);
      continue;
    }

    // Each loaded element must be the correct fractional portion of the
    // requested vector load.
    unsigned EltSizeInBits = Elt.getValueSizeInBits();
    if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
      return SDValue();

    if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
      return SDValue();
    // The element must lie entirely inside the load it was traced back to.
    unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
    if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
      return SDValue();

    LoadMask.setBit(i);
    LastLoadedElt = i;
  }
  assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
             NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.popcount() == NumElems)
    return DAG.getUNDEF(VT);
  if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.countr_zero();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  EVT EltBaseVT = EltBase.getValueType();
  assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
         "Register/Memory size mismatch");
  LoadSDNode *LDBase = Loads[FirstLoadedElt];
  assert(LDBase && "Did not find base load for merging consecutive loads");
  unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
  unsigned BaseSizeInBytes = BaseSizeInBits / 8;
  int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
  int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
  assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");

  // TODO: Support offsetting the base load.
  if (ByteOffsets[FirstLoadedElt] != 0)
    return SDValue();

  // Check to see if the element's load is consecutive to the base load
  // or offset from a previous (already checked) load.
  auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
    LoadSDNode *Ld = Loads[EltIdx];
    int64_t ByteOffset = ByteOffsets[EltIdx];
    if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
      // The element is a slice of a wider load: it is fine as long as the
      // element that starts that wider load (offset 0) is where we expect it.
      int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
      return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
              Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
    }
    return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
                                              EltIdx - FirstLoadedElt);
  };

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require a
  // an additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      if (!CheckConsecutiveLoad(LDBase, i)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  // Emit one load of type VT from the base load's address and make every
  // original load's chain ordering equivalent to the new load's.
  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(LDBase->isSimple() &&
           "Cannot merge volatile or atomic loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
                    MMOFlags);
    for (auto *LD : Loads)
      if (LD)
        DAG.makeEquivalentMemoryOrdering(LD, NewLd);
    return NewLd;
  };

  // Check if the base load is entirely dereferenceable.
  bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
      VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());

  // LOAD - all consecutive load/undefs (must start/end with a load or be
  // entirely dereferenceable). If we have found an entire vector of loads and
  // undefs, then return a large load of the entire vector width starting at the
  // base pointer. If the vector contains zeros, then attempt to shuffle those
  // elements.
  if (FirstLoadedElt == 0 &&
      (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
    // will lower to regular temporal loads and use the cache.
    if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
        VT.is256BitVector() && !Subtarget.hasInt256())
      return SDValue();

    if (NumElems == 1)
      return DAG.getBitcast(VT, Elts[FirstLoadedElt]);

    if (!ZeroMask)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
    if (!IsAfterLegalize && VT.isVector()) {
      unsigned NumMaskElts = VT.getVectorNumElements();
      if ((NumMaskElts % NumElems) == 0) {
        unsigned Scale = NumMaskElts / NumElems;
        SmallVector<int, 4> ClearMask(NumMaskElts, -1);
        for (unsigned i = 0; i < NumElems; ++i) {
          if (UndefMask[i])
            continue;
          // Zero elements select from the second (all-zero) shuffle operand.
          int Offset = ZeroMask[i] ? NumMaskElts : 0;
          for (unsigned j = 0; j != Scale; ++j)
            ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
        }
        SDValue V = CreateLoad(VT, LDBase);
        SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                   : DAG.getConstantFP(0.0, DL, VT);
        return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
      }
    }
  }

  // If the upper half of a ymm/zmm load is undef then just load the lower half.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    unsigned HalfNumElems = NumElems / 2;
    if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
      EVT HalfVT =
          EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
      SDValue HalfLD =
          EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
                                   DAG, Subtarget, IsAfterLegalize);
      if (HalfLD)
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
                           HalfLD, DAG.getIntPtrConstant(0, DL));
    }
  }

  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
       LoadSizeInBits == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
                                      : MVT::getIntegerVT(LoadSizeInBits);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
    // Allow v4f32 on SSE1 only targets.
    // FIXME: Add more isel patterns so we can just use VT directly.
    if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
      VecVT = MVT::v4f32;
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode = DAG.getMemIntrinsicNode(
          X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
          LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
      for (auto *LD : Loads)
        if (LD)
          DAG.makeEquivalentMemoryOrdering(LD, ResNode);
      return DAG.getBitcast(VT, ResNode);
    }
  }

  // BROADCAST - match the smallest possible repetition pattern, load that
  // scalar/subvector element and then broadcast to the entire vector.
  if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
      (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
    for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
      unsigned RepeatSize = SubElems * BaseSizeInBits;
      unsigned ScalarSize = std::min(RepeatSize, 64u);
      if (!Subtarget.hasAVX2() && ScalarSize < 32)
        continue;

      // Don't attempt a 1:N subvector broadcast - it should be caught by
      // combineConcatVectorOps, else will cause infinite loops.
      if (RepeatSize > ScalarSize && SubElems == 1)
        continue;

      bool Match = true;
      SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
      for (unsigned i = 0; i != NumElems && Match; ++i) {
        if (!LoadMask[i])
          continue;
        SDValue Elt = peekThroughBitcasts(Elts[i]);
        if (RepeatedLoads[i % SubElems].isUndef())
          RepeatedLoads[i % SubElems] = Elt;
        else
          Match &= (RepeatedLoads[i % SubElems] == Elt);
      }

      // We must have loads at both ends of the repetition.
      Match &= !RepeatedLoads.front().isUndef();
      Match &= !RepeatedLoads.back().isUndef();
      if (!Match)
        continue;

      EVT RepeatVT =
          VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
              ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
              : EVT::getFloatingPointVT(ScalarSize);
      if (RepeatSize > ScalarSize)
        RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
                                    RepeatSize / ScalarSize);
      EVT BroadcastVT =
          EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
                           VT.getSizeInBits() / ScalarSize);
      if (TLI.isTypeLegal(BroadcastVT)) {
        if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
                RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
          SDValue Broadcast = RepeatLoad;
          if (RepeatSize > ScalarSize) {
            // Subvector repeat: double up until the full width is reached.
            while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
              Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
          } else {
            if (!Subtarget.hasAVX2() &&
                !X86::mayFoldLoadIntoBroadcastFromMem(
                    RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
                    Subtarget,
                    /*AssumeSingleUse=*/true))
              return SDValue();
            Broadcast =
                DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
          }
          return DAG.getBitcast(VT, Broadcast);
        }
      }
    }
  }

  return SDValue();
}

// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize) { SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) { Elts.push_back(Elt); continue; } return SDValue(); } assert(Elts.size() == VT.getVectorNumElements()); return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, IsAfterLegalize); } static Constant *getConstantVector(MVT VT, ArrayRef Bits, const APInt &Undefs, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C); auto getConstantScalar = [&](const APInt &Val) -> Constant * { if (VT.isFloatingPoint()) { if (ScalarSize == 16) return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val)); if (ScalarSize == 32) return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); assert(ScalarSize == 64 && "Unsupported floating point scalar size"); return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } return Constant::getIntegerValue(Ty, Val); }; SmallVector ConstantVec; for (unsigned I = 0, E = Bits.size(); I != E; ++I) ConstantVec.push_back(Undefs[I] ? 
UndefValue::get(Ty) : getConstantScalar(Bits[I])); return ConstantVector::get(ArrayRef(ConstantVec)); } static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); auto getConstantScalar = [&](const APInt &Val) -> Constant * { if (VT.isFloatingPoint()) { if (ScalarSize == 16) return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val)); if (ScalarSize == 32) return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); assert(ScalarSize == 64 && "Unsupported floating point scalar size"); return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); }; if (ScalarSize == SplatBitSize) return getConstantScalar(SplatValue); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector ConstantVec; for (unsigned I = 0; I != NumElm; ++I) { APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I); ConstantVec.push_back(getConstantScalar(Val)); } return ConstantVector::get(ArrayRef(ConstantVec)); } static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { unsigned Opc = U->getOpcode(); // VPERMV/VPERMV3 shuffles can never fold their index operands. if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) return false; if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) return false; if (isTargetShuffle(Opc)) return true; if (Opc == ISD::BITCAST) // Ignore bitcasts return isFoldableUseOfShuffle(U); if (N->hasOneUse()) { // TODO, there may be some general way to know if a SDNode can // be folded. We now only know whether an MI is foldable. if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N) return false; return true; } } return false; } /// Attempt to use the vbroadcast instruction to generate a splat value /// from a splat BUILD_VECTOR which uses: /// a. A single scalar load, or a constant. /// b. Repeated pattern of constants (e.g. 
<0,1,0,1> or <0,1,2,3,0,1,2,3>). /// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget.hasAVX()) return SDValue(); MVT VT = BVOp->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); // See if the build vector is a repeating sequence of scalars (inc. splat). SDValue Ld; BitVector UndefElements; SmallVector Sequence; if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) { assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit."); if (Sequence.size() == 1) Ld = Sequence[0]; } // Attempt to use VBROADCASTM // From this pattern: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // // Create (VBROADCASTM v2i1 X) if (!Sequence.empty() && Subtarget.hasCDI()) { // If not a splat, are the upper sequence values zeroable? unsigned SeqLen = Sequence.size(); bool UpperZeroOrUndef = SeqLen == 1 || llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) { return !V || isNullConstantOrUndef(V); }); SDValue Op0 = Sequence[0]; if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) || (Op0.getOpcode() == ISD::ZERO_EXTEND && Op0.getOperand(0).getOpcode() == ISD::BITCAST))) { SDValue BOperand = Op0.getOpcode() == ISD::BITCAST ? 
Op0.getOperand(0) : Op0.getOperand(0).getOperand(0); MVT MaskVT = BOperand.getSimpleValueType(); MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen); if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen); if (!VT.is512BitVector() && !Subtarget.hasVLX()) { unsigned Scale = 512 / VT.getSizeInBits(); BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen)); } SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand); if (BcstVT.getSizeInBits() != VT.getSizeInBits()) Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits()); return DAG.getBitcast(VT, Bcst); } } } unsigned NumUndefElts = UndefElements.count(); if (!Ld || (NumElts - NumUndefElts) <= 1) { APInt SplatValue, Undef; unsigned SplatBitSize; bool HasUndef; // Check if this is a repeated constant pattern suitable for broadcasting. if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && SplatBitSize > VT.getScalarSizeInBits() && SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (SplatBitSize == 32 || SplatBitSize == 64 || (SplatBitSize < 32 && Subtarget.hasAVX2())) { // Load the constant scalar/subvector and broadcast it. 
MVT CVT = MVT::getIntegerVT(SplatBitSize); Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; Align Alignment = cast(CP)->getAlign(); SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CP}; MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); SDValue Brdcst = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment, MachineMemOperand::MOLoad); return DAG.getBitcast(VT, Brdcst); } if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm); Align Alignment = cast(VCP)->getAlign(); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), VCP}; MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment, MachineMemOperand::MOLoad); } } // If we are moving a scalar into a vector (Ld must be set and all elements // but 1 are undef) and that operation is not obviously supported by // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast. // That's better than general shuffling and may eliminate a load to GPR and // move from scalar to vector register. if (!Ld || NumElts - NumUndefElts != 1) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64))) return SDValue(); } bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // TODO: Handle broadcasts of non-constant sequences. 
// Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. // FIXME: Is the use count needed for non-constant, non-load case? if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
if (ScalarSize == 32 || (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) || (CVT == MVT::f16 && Subtarget.hasAVX2()) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); Align Alignment = cast(CP)->getAlign(); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CP}; MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment, MachineMemOperand::MOLoad); } } // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); // Make sure the non-chain result is only used by this build vector. 
if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0)) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget.hasVLX() && ScalarSize == 64)) { auto *LN = cast(Ld); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BCast = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); return BCast; } // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget.hasInt256() && Ld.getValueType().isInteger() && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) { auto *LN = cast(Ld); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BCast = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); return BCast; } if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // Unsupported broadcast. return SDValue(); } /// For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = ExtIdx->getAsZExtVal(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. 
ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? 
VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); return NV; } // Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types. static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); MVT IVT = VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16); SmallVector NewOps; for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16, Op.getOperand(I))); SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps); return DAG.getBitcast(VT, Res); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); if (ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorAllOnes(Op.getNode())) return Op; uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; if (auto *InC = dyn_cast(In)) { Immediate |= (InC->getZExtValue() & 0x1) << idx; HasConstElts = true; } else { NonConstIdx.push_back(idx); } if (SplatIdx < 0) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) { // The build_vector allows the scalar element to be larger than the vector // element type. We need to mask it to use as a condition unless we know // the upper bits are zero. 
// FIXME: Use computeKnownBits instead of checking specific opcode? SDValue Cond = Op.getOperand(SplatIdx); assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!"); if (Cond.getOpcode() != ISD::SETCC) Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, DAG.getConstant(1, dl, MVT::i8)); // Perform the select in the scalar domain so we can use cmov. if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue Select = DAG.getSelect(dl, MVT::i32, Cond, DAG.getAllOnesConstant(dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); Select = DAG.getBitcast(MVT::v32i1, Select); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select); } else { MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); SDValue Select = DAG.getSelect(dl, ImmVT, Cond, DAG.getAllOnesConstant(dl, ImmVT), DAG.getConstant(0, dl, ImmVT)); MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; Select = DAG.getBitcast(VecVT, Select); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select, DAG.getIntPtrConstant(0, dl)); } } // insert elements one by one SDValue DstVec; if (HasConstElts) { if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32); SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32); ImmL = DAG.getBitcast(MVT::v32i1, ImmL); ImmH = DAG.getBitcast(MVT::v32i1, ImmH); DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); } else { MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); MVT VecVT = VT.getSizeInBits() >= 8 ? 
VT : MVT::v8i1; DstVec = DAG.getBitcast(VecVT, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec, DAG.getIntPtrConstant(0, dl)); } } else DstVec = DAG.getUNDEF(VT); for (unsigned InsertIdx : NonConstIdx) { DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); } return DstVec; } LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) { switch (Opcode) { case X86ISD::PACKSS: case X86ISD::PACKUS: case X86ISD::FHADD: case X86ISD::FHSUB: case X86ISD::HADD: case X86ISD::HSUB: return true; } return false; } /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// 128-bit partial horizontal operation on a 256-bit vector, but that operation /// may not match the layout of an x86 256-bit horizontal instruction. /// In other words, if this returns true, then some extraction/insertion will /// be required to produce a valid horizontal instruction. /// /// Parameter \p Opcode defines the kind of horizontal operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). /// /// TODO: This function was originally used to match both real and fake partial /// horizontal operations, but the index-matching logic is incorrect for that. /// See the corrected implementation in isHopBuildVector(). Can we reduce this /// code because it is only used for partial h-op matching now? 
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops"); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->isUndef()) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = Op0.getConstantOperandVal(1); unsigned I1 = Op1.getConstantOperandVal(1); if (i * 2 < NumElts) { if (V0.isUndef()) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.isUndef()) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? 
V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. 
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  // Split both inputs into their lower and upper 128-bit halves.
  const unsigned UpperHalfIdx = VT.getVectorNumElements() / 2;
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, UpperHalfIdx, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, UpperHalfIdx, DAG, DL);
  MVT HalfVT = V0_LO.getSimpleValueType();

  // Both result halves start out as UNDEF and are only replaced when a
  // horizontal binop is actually worth emitting.
  SDValue LO = DAG.getUNDEF(HalfVT);
  SDValue HI = DAG.getUNDEF(HalfVT);

  // Choose the operand pairing for each result half:
  //   Mode set:   LO = op(V0_LO, V0_HI), HI = op(V1_LO, V1_HI)
  //   otherwise:  LO = op(V0_LO, V1_LO), HI = op(V0_HI, V1_HI)
  const SDValue &LoRHS = Mode ? V0_HI : V1_LO;
  const SDValue &HiLHS = Mode ? V1_LO : V0_HI;

  // A half is skipped when the inputs that feed it are undef: the whole
  // source vector when Mode is set, or both 128-bit operands otherwise.
  const bool LoInputsUndef =
      Mode ? V0->isUndef() : (V0_LO->isUndef() && V1_LO->isUndef());
  const bool HiInputsUndef =
      Mode ? V1->isUndef() : (V0_HI->isUndef() && V1_HI->isUndef());

  // Don't emit a horizontal binop if the result is expected to be UNDEF.
  if (!isUndefLO && !LoInputsUndef)
    LO = DAG.getNode(X86Opcode, DL, HalfVT, V0_LO, LoRHS);
  if (!isUndefHI && !HiInputsUndef)
    HI = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, V1_HI);

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}

/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd) { MVT VT = BV->getSimpleValueType(0); if (!Subtarget.hasSSE3() || !VT.isFloatingPoint()) return false; unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; // Odd-numbered elements in the input build vector are obtained from // adding/subtracting two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting/adding two integer/float elements. unsigned Opc[2] = {0, 0}; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) continue; // Early exit if we found an unexpected opcode. if (Opcode != ISD::FADD && Opcode != ISD::FSUB) return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op0.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return false; unsigned I0 = Op0.getConstantOperandVal(1); if (I0 != i) return false; // We found a valid add/sub node, make sure its the same opcode as previous // elements for this parity. if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode) return false; Opc[i % 2] = Opcode; // Update InVec0 and InVec1. if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return false; } if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return false; } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. 
if (InVec0 != Op0.getOperand(0)) { if (Opcode == ISD::FSUB) return false; // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return false; } if (InVec1 != Op1.getOperand(0)) return false; // Increment the number of extractions done. ++NumExtracts; } // Ensure we have found an opcode for both parities and that they are // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the // inputs are undef. if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] || InVec0.isUndef() || InVec1.isUndef()) return false; IsSubAdd = Opc[0] == ISD::FADD; Opnd0 = InVec0; Opnd1 = InVec1; return true; } /// Returns true if is possible to fold MUL and an idiom that has already been /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// /// Prior to calling this function it should be known that there is some /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called /// before replacement of such SDNode with ADDSUB operation. Thus the number /// of \p Opnd0 uses is expected to be equal to 2. /// For example, this function may be called for the following IR: /// %AB = fmul fast <2 x double> %A, %B /// %Sub = fsub fast <2 x double> %AB, %C /// %Add = fadd fast <2 x double> %AB, %C /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, /// <2 x i32> /// There is a def for %Addsub here, which potentially can be replaced by /// X86ISD::ADDSUB operation: /// %Addsub = X86ISD::ADDSUB %AB, %C /// and such ADDSUB can further be replaced with FMADDSUB: /// %Addsub = FMADDSUB %A, %B, %C. 
/// /// The main reason why this method is called before the replacement of the /// recognized ADDSUB idiom with ADDSUB operation is that such replacement /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses) { if (Opnd0.getOpcode() != ISD::FMUL || !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; // FIXME: These checks must match the similar ones in // DAGCombiner::visitFADDForFMACombine. It would be good to have one // function that would answer if it is Ok to fuse MUL + ADD to FMADD // or MUL + ADDSUB to FMADDSUB. const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); if (!AllowFusion) return false; Opnd2 = Opnd1; Opnd1 = Opnd0.getOperand(1); Opnd0 = Opnd0.getOperand(0); return true; } /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or /// X86ISD::FMSUBADD node. static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; unsigned NumExtracts; bool IsSubAdd; if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd)) return SDValue(); MVT VT = BV->getSimpleValueType(0); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } // We only support ADDSUB. if (IsSubAdd) return SDValue(); // There are no known X86 targets with 512-bit ADDSUB instructions! // Convert to blend(fsub,fadd). 
if (VT.is512BitVector()) { SmallVector Mask; for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) { Mask.push_back(I); Mask.push_back(I + E + 1); } SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1); SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1); return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask); } return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1) { // Initialize outputs to known values. MVT VT = BV->getSimpleValueType(0); HOpcode = ISD::DELETED_NODE; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit // half of the result is calculated independently from the 128-bit halves of // the inputs, so that makes the index-checking logic below more complicated. unsigned NumElts = VT.getVectorNumElements(); unsigned GenericOpcode = ISD::DELETED_NODE; unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1; unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2; for (unsigned i = 0; i != Num128BitChunks; ++i) { for (unsigned j = 0; j != NumEltsIn128Bits; ++j) { // Ignore undef elements. SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j); if (Op.isUndef()) continue; // If there's an opcode mismatch, we're done. if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode) return false; // Initialize horizontal opcode. 
if (HOpcode == ISD::DELETED_NODE) { GenericOpcode = Op.getOpcode(); switch (GenericOpcode) { // clang-format off case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: return false; // clang-format on } } SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op0.getOperand(0) != Op1.getOperand(0) || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || !Op.hasOneUse()) return false; // The source vector is chosen based on which 64-bit half of the // destination vector is being calculated. if (j < NumEltsIn64Bits) { if (V0.isUndef()) V0 = Op0.getOperand(0); } else { if (V1.isUndef()) V1 = Op0.getOperand(0); } SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1; if (SourceVec != Op0.getOperand(0)) return false; // op (extract_vector_elt A, I), (extract_vector_elt A, I+1) unsigned ExtIndex0 = Op0.getConstantOperandVal(1); unsigned ExtIndex1 = Op1.getConstantOperandVal(1); unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2; if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1) continue; // If this is not a commutative op, this does not match. if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD) return false; // Addition is commutative, so try swapping the extract indexes. // op (extract_vector_elt A, I+1), (extract_vector_elt A, I) if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1) continue; // Extract indexes do not match horizontal requirement. return false; } } // We matched. Opcode and operands are returned by reference as arguments. 
return true; } static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1) { // If either input vector is not the same size as the build vector, // extract/insert the low bits to the correct size. // This is free (examples: zmm --> xmm, xmm --> ymm). MVT VT = BV->getSimpleValueType(0); unsigned Width = VT.getSizeInBits(); if (V0.getValueSizeInBits() > Width) V0 = extractSubVector(V0, 0, DAG, DL, Width); else if (V0.getValueSizeInBits() < Width) V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width); if (V1.getValueSizeInBits() > Width) V1 = extractSubVector(V1, 0, DAG, DL, Width); else if (V1.getValueSizeInBits() < Width) V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width); unsigned NumElts = VT.getVectorNumElements(); APInt DemandedElts = APInt::getAllOnes(NumElts); for (unsigned i = 0; i != NumElts; ++i) if (BV->getOperand(i).isUndef()) DemandedElts.clearBit(i); // If we don't need the upper xmm, then perform as a xmm hop. unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { MVT HalfVT = VT.getHalfNumVectorElementsVT(); V0 = extractSubVector(V0, 0, DAG, DL, 128); V1 = extractSubVector(V1, 0, DAG, DL, 128); SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1); return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256); } return DAG.getNode(HOpcode, DL, VT, V0, V1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. unsigned NumNonUndefs = count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); }); if (NumNonUndefs < 2) return SDValue(); // There are 4 sets of horizontal math operations distinguished by type: // int/FP at 128-bit/256-bit. 
Each type was introduced with a different // subtarget feature. Try to match those "native" patterns first. MVT VT = BV->getSimpleValueType(0); if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) || ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) || ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) || ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) { unsigned HOpcode; SDValue V0, V1; if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1); } // Try harder to match 256-bit ops by using extract/concat. if (!Subtarget.hasAVX() || !VT.is256BitVector()) return SDValue(); // Count the number of UNDEF operands in the build_vector in input. unsigned NumElts = VT.getVectorNumElements(); unsigned Half = NumElts / 2; unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsHI++; SDValue InVec0, InVec1; if (VT == MVT::v8i32 || VT == MVT::v16i16) { SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. 
if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binops followed by // a concat vector. We must adjust the outputs from the partial horizontal // matching calls above to account for undefined vector halves. SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0; SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1; assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?"); bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) { unsigned X86Opcode; if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG); /// If a BUILD_VECTOR's source elements all apply the same bit operation and /// one of their operands is constant, lower to a pair of BUILD_VECTOR and /// just apply the bit to the vectors. 
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  bool IsShift = false;
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    IsShift = true;
    break;
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }

  // Split every element into its LHS and (constant) RHS operand.
  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();

    // Extend shift amounts. A size mismatch is only tolerated for shifts.
    if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
      if (!IsShift)
        return SDValue();
      RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
    }

    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  // Limit to shifts by uniform immediates.
  // TODO: Only accept vXi8/vXi64 special cases?
  // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
  if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
    return SDValue();

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);

  if (!IsShift)
    return Res;

  // Immediately lower the shift to ensure the constant build vector doesn't
  // get converted to a constant pool before the shift is lowered.
  return LowerShift(Res, Subtarget, DAG);
}

/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return Op;

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getOnesVector(VT, DAG, DL);
  }

  // Not a constant this function handles.
  return SDValue();
}

/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT ShuffleVT = VT;
  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Adjust IndicesVec to match VT size.
  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
         "Illegal variable permute mask size");
  if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
    // Narrow/widen the indices vector to the correct size.
    if (IndicesVec.getValueSizeInBits() > SizeInBits)
      IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
                                    NumElts * VT.getScalarSizeInBits());
    else if (IndicesVec.getValueSizeInBits() < SizeInBits)
      IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
                                  SDLoc(IndicesVec), SizeInBits);
    // Zero-extend the index elements within the vector.
    if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
                               IndicesVT, IndicesVec);
  }
  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

  // Handle SrcVec that don't match VT type.
  if (SrcVec.getValueSizeInBits() != SizeInBits) {
    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
      // Handle larger SrcVec by treating it as a larger permute.
      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
                                  Subtarget, DAG, SDLoc(IndicesVec));
      SDValue NewSrcVec =
          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
      if (NewSrcVec)
        return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
      return SDValue();
    } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
      // Widen smaller SrcVec to match VT.
      SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
    } else
      return SDValue();
  }

  // Rewrites each index so that it addresses Scale narrower sub-elements
  // instead of one wide element.
  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
    EVT SrcVT = Idx.getValueType();
    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
    uint64_t IndexScale = 0;
    uint64_t IndexOffset = 0;

    // If we're scaling a smaller permute op, then we need to repeat the
    // indices, scaling and offsetting them as well.
    // e.g. v4i32 -> v16i8 (Scale = 4)
    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
    for (uint64_t i = 0; i != Scale; ++i) {
      IndexScale |= Scale << (i * NumDstBits);
      IndexOffset |= i << (i * NumDstBits);
    }

    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
    return Idx;
  };

  // Pick the permute opcode (and, where needed, the type it operates on) for
  // this VT and subtarget. Cases that need a multi-node expansion return
  // directly; Opcode stays 0 when nothing suitable is available.
  unsigned Opcode = 0;
  switch (VT.SimpleTy) {
  default:
    break;
  case MVT::v16i8:
    if (Subtarget.hasSSSE3())
      Opcode = X86ISD::PSHUFB;
    break;
  case MVT::v8i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v4f32:
  case MVT::v4i32:
    if (Subtarget.hasAVX()) {
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v4f32;
    } else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v2f64:
  case MVT::v2i64:
    if (Subtarget.hasAVX()) {
      // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v2f64;
    } else if (Subtarget.hasSSE41()) {
      // SSE41 can compare v2i64 - select between indices 0 and 1.
      return DAG.getSelectCC(
          DL, IndicesVec,
          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
          ISD::CondCode::SETEQ);
    }
    break;
  case MVT::v32i8:
    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasXOP()) {
      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
      return DAG.getNode(
          ISD::CONCAT_VECTORS, DL, VT,
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
    } else if (Subtarget.hasAVX()) {
      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                              ArrayRef<SDValue> Ops) {
        // Permute Lo and Hi and then select based on index range.
        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
        // care about the bit[7] as it's just an index vector.
        SDValue Idx = Ops[2];
        EVT VT = Idx.getValueType();
        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
                               ISD::CondCode::SETGT);
      };
      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
                              PSHUFBBuilder);
    }
    break;
  case MVT::v16i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      // Scale to v32i8 and perform as v32i8.
      IndicesVec = ScaleIndices(IndicesVec, 2);
      return DAG.getBitcast(
          VT, createVariablePermute(
                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
    }
    break;
  case MVT::v8f32:
  case MVT::v8i32:
    if (Subtarget.hasAVX2())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {0, 1, 2, 3, 0, 1, 2, 3});
      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {4, 5, 6, 7, 4, 5, 6, 7});
      if (Subtarget.hasXOP())
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v4i64:
  case MVT::v4f64:
    if (Subtarget.hasAVX512()) {
      if (!Subtarget.hasVLX()) {
        // Without VLX, widen to 8 elements, permute there and take the low
        // 256 bits of the result.
        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
                                SDLoc(SrcVec));
        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                                    DAG, SDLoc(IndicesVec));
        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
                                            DAG, Subtarget);
        return extract256BitVector(Res, 0, DAG, DL);
      }
      Opcode = X86ISD::VPERMV;
    } else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
      SDValue LoLo =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
      SDValue HiHi =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
      // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      if (Subtarget.hasXOP())
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPD only uses index bit[1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v64i8:
    if (Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v32i16:
    if (Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v16f32:
  case MVT::v16i32:
  case MVT::v8f64:
  case MVT::v8i64:
    if (Subtarget.hasAVX512())
      Opcode = X86ISD::VPERMV;
    break;
  }
  if (!Opcode)
    return SDValue();

  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
         "Illegal variable permute shuffle type");

  // If the permute runs on narrower elements than VT, rescale the indices.
  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
  if (Scale > 1)
    IndicesVec = ScaleIndices(IndicesVec, Scale);

  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

  // VPERMV takes the index vector as its first operand; the other opcodes
  // take the source vector first.
  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
  SDValue Res = Opcode == X86ISD::VPERMV
                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
  return DAG.getBitcast(VT, Res);
}

// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
//               (extract_elt V, (extract_elt I, 1)),
//                    ...
// -> // (vpermv I, V) // // TODO: Handle undefs // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { SDValue Op = V.getOperand(Idx); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract encountered in V, set the source vector, // otherwise verify the extract is from the previously defined source // vector. if (!SrcVec) SrcVec = Op.getOperand(0); else if (SrcVec != Op.getOperand(0)) return SDValue(); SDValue ExtractedIndex = Op->getOperand(1); // Peek through extends. if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) ExtractedIndex = ExtractedIndex.getOperand(0); if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract from the index vector candidate, set the // indices vector, otherwise verify the extract is from the previously // defined indices vector. 
if (!IndicesVec) IndicesVec = ExtractedIndex.getOperand(0); else if (IndicesVec != ExtractedIndex.getOperand(0)) return SDValue(); auto *PermIdx = dyn_cast(ExtractedIndex.getOperand(1)); if (!PermIdx || PermIdx->getAPIntValue() != Idx) return SDValue(); } MVT VT = V.getSimpleValueType(); return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); MVT OpEltVT = Op.getOperand(0).getSimpleValueType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget); if (VT.getVectorElementType() == MVT::bf16 && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget)) return VectorCst; unsigned EVTBits = EltVT.getSizeInBits(); APInt UndefMask = APInt::getZero(NumElems); APInt FrozenUndefMask = APInt::getZero(NumElems); APInt ZeroMask = APInt::getZero(NumElems); APInt NonZeroMask = APInt::getZero(NumElems); bool IsAllConstants = true; bool OneUseFrozenUndefs = true; SmallSet Values; unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.isUndef()) { UndefMask.setBit(i); continue; } if (ISD::isFreezeUndef(Elt.getNode())) { OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse(); FrozenUndefMask.setBit(i); continue; } Values.insert(Elt); if (!isIntOrFPConstant(Elt)) { IsAllConstants = false; NumConstants--; } if (X86::isZeroNode(Elt)) { ZeroMask.setBit(i); } else { NonZeroMask.setBit(i); } } // All undef vector. Return an UNDEF. if (UndefMask.isAllOnes()) return DAG.getUNDEF(VT); // All undef/freeze(undef) vector. Return a FREEZE UNDEF. 
if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes()) return DAG.getFreeze(DAG.getUNDEF(VT)); // All undef/freeze(undef)/zero vector. Return a zero vector. if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes()) return getZeroVector(VT, Subtarget, DAG, dl); // If we have multiple FREEZE-UNDEF operands, we are likely going to end up // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR, // and blend the FREEZE-UNDEF operands back in. // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand? if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount(); NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) { SmallVector BlendMask(NumElems, -1); SmallVector Elts(NumElems, DAG.getUNDEF(OpEltVT)); for (unsigned i = 0; i < NumElems; ++i) { if (UndefMask[i]) { BlendMask[i] = -1; continue; } BlendMask[i] = i; if (!FrozenUndefMask[i]) Elts[i] = Op.getOperand(i); else BlendMask[i] += NumElems; } SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts); SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT)); SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt); return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask); } BuildVectorSDNode *BV = cast(Op.getNode()); // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might // be better off lowering to a smaller build vector and padding with // undef/zero. if ((VT.is256BitVector() || VT.is512BitVector()) && !isFoldableUseOfShuffle(BV)) { unsigned UpperElems = NumElems / 2; APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask; unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one(); if (NumUpperUndefsOrZeros >= UpperElems) { if (VT.is512BitVector() && NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4))) UpperElems = NumElems - (NumElems / 4); // If freeze(undef) is in any upper elements, force to zero. 
bool UndefUpper = UndefMask.countl_one() >= UpperElems; MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems); SDValue NewBV = DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems)); return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl); } } if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG)) return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG)) return BitOp; unsigned NumZero = ZeroMask.popcount(); unsigned NumNonZero = NonZeroMask.popcount(); // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not // supported, fall back to a shuffle to get the scalar blended with the // constants. Insertion into a zero vector is handled as a special-case // somewhere below here. if (NumConstants == NumElems - 1 && NumNonZero != 1 && FrozenUndefMask.isZero() && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { // Create an all-constant vector. The variable element in the old // build vector is replaced by undef in the constant vector. Save the // variable scalar element and its index for use in the insertelement. 
LLVMContext &Context = *DAG.getContext(); Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); SmallVector ConstVecOps(NumElems, UndefValue::get(EltType)); SDValue VarElt; SDValue InsIndex; for (unsigned i = 0; i != NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); else if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); else if (!Elt.isUndef()) { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; InsIndex = DAG.getVectorIdxConstant(i, dl); } } Constant *CV = ConstantVector::get(ConstVecOps); SDValue DAGConstVec = DAG.getConstantPool(CV, VT); // The constants we just created may not be legal (eg, floating point). We // must lower the vector right here because we can not guarantee that we'll // legalize it before loading it. This is also why we could not just create // a new build vector here. If the build vector contains illegal constants, // it could get split back up into a series of insert elements. // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); unsigned InsertC = InsIndex->getAsZExtVal(); unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); if (InsertC < NumEltsInLow128Bits) return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); // There's no good way to insert into the high elements of a >128-bit // vector, so use shuffles to avoid an extract/insert sequence. 
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?"); assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector"); SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) ShuffleMask.push_back(i == InsertC ? NumElts : i); SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt); return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask); } // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = NonZeroMask.countr_zero(); SDValue Item = Op.getOperand(Idx); // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) || (EltVT == MVT::i16 && Subtarget.hasFP16())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a // zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? 
if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = NonZeroMask.countr_zero(); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget)) return V; // See if we can use a vector load to get all of the elements. { SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElems); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } // If this is a splat of pairs of 32-bit elements, we can use a narrower // build_vector and broadcast it. // TODO: We could probably generalize this more. 
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef Ops) { // Make sure all the even/odd operands match. for (unsigned i = 2; i != NumElems; ++i) if (Ops[i % 2] != Op.getOperand(i)) return false; return true; }; if (CanSplat(Op, NumElems, Ops)) { MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; MVT NarrowVT = MVT::getVectorVT(EltVT, 4); // Create a new build vector and cast to v2i64/v2f64. SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), DAG.getBuildVector(NarrowVT, dl, Ops)); // Broadcast from v2i64/v2f64 and cast to final VT. MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV)); } } // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.getSizeInBits() > 128) { MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2); // Build both the lower and upper subvector. SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. return concatSubVectors(Lower, Upper, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = NonZeroMask.countr_zero(); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. 
if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EltVT == MVT::i16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. if (NumElems == 4 && NumZero > 0) { SmallVector Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !NonZeroMask[i]; if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2; bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. 
if (Subtarget.hasSSE41() && EltVT != MVT::f16) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). SmallVector Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { if (!Op.getOperand(i).isUndef()) Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 1 ==> X: // : unpcklps 2, 3 ==> Y: // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. SmallVector Mask; for(unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) Mask.push_back(NumElems+i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. // TODO: Detect subvector broadcast here instead of DAG combine? 
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // Lowers a CONCAT_VECTORS node whose result is 256 or 512 bits wide (see the
  // assert below). With more than two non-zero operands the node is split
  // into two half-width CONCAT_VECTORS; otherwise the non-zero operands are
  // inserted one by one into a zero / freeze(undef) / undef base vector.
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
         "Value type must be 256-/512-bit wide");

  unsigned NumOperands = Op.getNumOperands();
  unsigned NumFreezeUndef = 0;
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0; // Bit i is set when operand i must be inserted.
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isFreezeUndef(SubVec.getNode())) {
      // If the freeze(undef) has multiple uses then we must fold to zero.
      if (SubVec.hasOneUse())
        ++NumFreezeUndef;
      else
        ++NumZero;
    }
    else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      ++NumZero;
    else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      // Shift an unsigned one so that the highest index the assert admits
      // does not overflow a signed int.
      NonZeros |= 1u << i;
      ++NumNonZero;
    }
  }

  // If we have more than 2 non-zeros, build each half separately.
  if (NumNonZero > 2) {
    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands/2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands/2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  // Otherwise, build it up through insert_subvectors.
  SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
                        : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
                                          : DAG.getUNDEF(ResVT));

  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  unsigned NumSubElems = SubVT.getVectorNumElements();
  for (unsigned i = 0; i != NumOperands; ++i) {
    if ((NonZeros & (1u << i)) == 0)
      continue;

    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
                      Op.getOperand(i),
                      DAG.getIntPtrConstant(i * NumSubElems, dl));
  }

  return Vec;
}

// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
// TODO: Merge this with LowerAVXCONCAT_VECTORS? static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); uint64_t Zeros = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) Zeros |= (uint64_t)1 << i; else NonZeros |= (uint64_t)1 << i; } unsigned NumElems = ResVT.getVectorNumElements(); // If we are inserting non-zero vector and there are zeros in LSBs and undef // in the MSBs we need to emit a KSHIFTL. The generic lowering to // insert_subvector will give us two kshifts. if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros && Log2_64(NonZeros) != NumOperands - 1) { unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget); Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl); Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op, DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op, DAG.getIntPtrConstant(0, dl)); } // If there are zero or one non-zeros we can handle this very simply. if (NonZeros == 0 || isPowerOf2_64(NonZeros)) { SDValue Vec = Zeros ? 
DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT); if (!NonZeros) return Vec; unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. 
It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != i) return false; } return true; } /// Test whether there are elements crossing LaneSizeInBits lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef Mask) { assert(LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && "Illegal shuffle lane size"); int LaneSize = LaneSizeInBits / ScalarSizeInBits; int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask); } /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come /// from multiple lanes - this is different to isLaneCrossingShuffleMask to /// better support 'repeated mask + lane permute' style shuffles. 
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef Mask) { assert(LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && "Illegal shuffle lane size"); int NumElts = Mask.size(); int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits; int NumLanes = NumElts / NumEltsPerLane; if (NumLanes > 1) { for (int i = 0; i != NumLanes; ++i) { int SrcLane = -1; for (int j = 0; j != NumEltsPerLane; ++j) { int M = Mask[(i * NumEltsPerLane) + j]; if (M < 0) continue; int Lane = (M % NumElts) / NumEltsPerLane; if (SrcLane >= 0 && SrcLane != Lane) return true; SrcLane = Lane; } } } return false; } /// Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same /// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? 
Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a shuffle mask is equivalent within each 128-bit lane. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask) { SmallVector RepeatedMask; return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } /// Test whether a shuffle mask is equivalent within each 256-bit lane. static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = LaneSizeInBits / EltSizeInBits; RepeatedMask.assign(LaneSize, SM_SentinelUndef); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0)); if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] == SM_SentinelZero) { if (!isUndefOrZero(RepeatedMask[i % LaneSize])) return false; RepeatedMask[i % LaneSize] = SM_SentinelZero; continue; } if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Handle the in-lane shuffles by detecting if and when they repeat. Adjust // later vector indices to start at multiples of LaneSize instead of Size. 
int LaneM = Mask[i] / Size; int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize); if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(), Mask, RepeatedMask); } /// Checks whether the vector elements referenced by two shuffle masks are /// equivalent. static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx) { assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx && ExpectedIdx < MaskSize && "Out of range element index"); if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode()) return false; switch (Op.getOpcode()) { case ISD::BUILD_VECTOR: // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. // TODO: Handle MaskSize != Op.getNumOperands()? if (MaskSize == (int)Op.getNumOperands() && MaskSize == (int)ExpectedOp.getNumOperands()) return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx); break; case X86ISD::VBROADCAST: case X86ISD::VBROADCAST_LOAD: // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()? return (Op == ExpectedOp && (int)Op.getValueType().getVectorNumElements() == MaskSize); case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: case X86ISD::FHSUB: case X86ISD::PACKSS: case X86ISD::PACKUS: // HOP(X,X) can refer to the elt from the lower/upper half of a lane. // TODO: Handle MaskSize != NumElts? 
// TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases. if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) { MVT VT = Op.getSimpleValueType(); int NumElts = VT.getVectorNumElements(); if (MaskSize == NumElts) { int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; int NumHalfEltsPerLane = NumEltsPerLane / 2; bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane); bool SameElt = (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane); return SameLane && SameElt; } } break; } return false; } /// Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static bool isShuffleEquivalent(ArrayRef Mask, ArrayRef ExpectedMask, SDValue V1 = SDValue(), SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; for (int i = 0; i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); int MaskIdx = Mask[i]; int ExpectedIdx = ExpectedMask[i]; if (0 <= MaskIdx && MaskIdx != ExpectedIdx) { SDValue MaskV = MaskIdx < Size ? V1 : V2; SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) return false; } } return true; } /// Checks whether a target shuffle mask is equivalent to an explicit pattern. /// /// The masks must be exactly the same width. /// /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. 
/// /// SM_SentinelZero is accepted as a valid negative index but must match in /// both, or via a known bits test. static bool isTargetShuffleEquivalent(MVT VT, ArrayRef Mask, ArrayRef ExpectedMask, const SelectionDAG &DAG, SDValue V1 = SDValue(), SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; assert(llvm::all_of(ExpectedMask, [Size](int M) { return isInRange(M, 0, 2 * Size); }) && "Illegal target shuffle mask"); // Check for out-of-range target shuffle mask indices. if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size)) return false; // Don't use V1/V2 if they're not the same size as the shuffle mask type. if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() || !V1.getValueType().isVector())) V1 = SDValue(); if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() || !V2.getValueType().isVector())) V2 = SDValue(); APInt ZeroV1 = APInt::getZero(Size); APInt ZeroV2 = APInt::getZero(Size); for (int i = 0; i < Size; ++i) { int MaskIdx = Mask[i]; int ExpectedIdx = ExpectedMask[i]; if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx) continue; if (MaskIdx == SM_SentinelZero) { // If we need this expected index to be a zero element, then update the // relevant zero mask and perform the known bits at the end to minimize // repeated computes. SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; if (ExpectedV && Size == (int)ExpectedV.getValueType().getVectorNumElements()) { int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2; ZeroMask.setBit(BitIdx); continue; } } if (MaskIdx >= 0) { SDValue MaskV = MaskIdx < Size ? V1 : V2; SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); ExpectedIdx = ExpectedIdx < Size ? 
ExpectedIdx : (ExpectedIdx - Size); if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) continue; } return false; } return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) && (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2)); } // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT, const SelectionDAG &DAG) { if (VT != MVT::v8i32 && VT != MVT::v8f32) return false; SmallVector Unpcklwd; createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, /* Unary = */ false); SmallVector Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) || isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG)); return IsUnpackwdMask; } static bool is128BitUnpackShuffleMask(ArrayRef Mask, const SelectionDAG &DAG) { // Create 128-bit vector type based on mask size. MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); MVT VT = MVT::getVectorVT(EltVT, Mask.size()); // We can't assume a canonical shuffle mask, so try the commuted version too. SmallVector CommutedMask(Mask); ShuffleVectorSDNode::commuteMask(CommutedMask); // Match any of unary/binary or low/high. for (unsigned i = 0; i != 4; ++i) { SmallVector UnpackMask; createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) || isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG)) return true; } return false; } /// Return true if a shuffle mask chooses elements identically in its top and /// bottom halves. For example, any splat mask has the same top and bottom /// halves. If an element is undefined in only one half of the mask, the halves /// are not considered identical. 
static bool hasIdenticalHalvesShuffleMask(ArrayRef Mask) { assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); unsigned HalfSize = Mask.size() / 2; for (unsigned i = 0; i != HalfSize; ++i) { if (Mask[i] != Mask[i + HalfSize]) return false; } return true; } /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static unsigned getV4X86ShuffleImm(ArrayRef Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); // If the mask only uses one non-undef element, then fully 'splat' it to // improve later broadcast matching. int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin(); assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask"); int FirstElt = Mask[FirstIndex]; if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; })) return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt; unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; return Imm; } static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } // The Shuffle result is as follow: // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. 
// Each Zeroable's element correspond to a particular Mask's element. // As described in computeZeroableShuffleElements function. // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element if (NextElement < 0) { NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } // Exit if the mask's non zero elements are not in increasing order. if (NextElement != Mask[i]) return false; NextElement++; } return true; } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; const int NumEltBytes = VT.getScalarSizeInBits() / 8; assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())); SmallVector PSHUFBMask(NumBytes); // Sign bit set in i8 mask means zero element. SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); continue; } if (Zeroable[i / NumEltBytes]) { PSHUFBMask[i] = ZeroMask; continue; } // We can only use a single input of V1 or V2. SDValue SrcV = (M >= Size ? 
V2 : V1); if (V && V != SrcV) return SDValue(); V = SrcV; M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) return SDValue(); M = M % LaneSize; M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements"); SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? 
V2 : V1; return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; for (int i = 0; i != NumElts; i += 2) { int M1 = TargetMask[i + 0]; int M2 = TargetMask[i + 1]; Undef1 &= (SM_SentinelUndef == M1); Undef2 &= (SM_SentinelUndef == M2); Zero1 &= isUndefOrZero(M1); Zero2 &= isUndefOrZero(M2); } assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && "Zeroable shuffle detected"); // Attempt to match the target mask against the unpack lo/hi mask patterns. SmallVector Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1, (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1, (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } // If an unary shuffle, attempt to match as an unpack lo/hi with zero. if (IsUnary && (Zero1 || Zero2)) { // Don't bother if we can blend instead. if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) return false; bool MatchLo = true, MatchHi = true; for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { int M = TargetMask[i]; // Ignore if the input is known to be zero or the index is undef. 
if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || (M == SM_SentinelUndef)) continue; MatchLo &= (M == Unpckl[i]); MatchHi &= (M == Unpckh[i]); } if (MatchLo || MatchHi) { UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; return true; } } // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; } } return false; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } /// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) /// followed by unpack 256-bit. 
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SmallVector Unpckl, Unpckh; createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true); createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false); unsigned UnpackOpcode; if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) UnpackOpcode = X86ISD::UNPCKL; else if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) UnpackOpcode = X86ISD::UNPCKH; else return SDValue(); // This is a "natural" unpack operation (rather than the 128-bit sectored // operation implemented by AVX). We need to rearrange 64-bit chunks of the // input in order to use the x86 instruction. V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1), DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3}); V1 = DAG.getBitcast(VT, V1); return DAG.getNode(UnpackOpcode, DL, VT, V1, V1); } // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the // source into the lower elements and zeroing the upper elements. static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget) { if (!VT.is512BitVector() && !Subtarget.hasVLX()) return false; unsigned NumElts = Mask.size(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned MaxScale = 64 / EltSizeInBits; for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { unsigned SrcEltBits = EltSizeInBits * Scale; if (SrcEltBits < 32 && !Subtarget.hasBWI()) continue; unsigned NumSrcElts = NumElts / Scale; if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale)) continue; unsigned UpperElts = NumElts - NumSrcElts; if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) continue; SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale); SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts); DstVT = MVT::getIntegerVT(EltSizeInBits); if ((NumSrcElts * EltSizeInBits) >= 128) { // ISD::TRUNCATE DstVT = MVT::getVectorVT(DstVT, NumSrcElts); } else { // X86ISD::VTRUNC DstVT = 
MVT::getVectorVT(DstVT, 128 / EltSizeInBits); } return true; } return false; } // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper // element padding to the final DstVT. static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers) { MVT SrcVT = Src.getSimpleValueType(); MVT DstSVT = DstVT.getScalarType(); unsigned NumDstElts = DstVT.getVectorNumElements(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits(); if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) return SDValue(); // Perform a direct ISD::TRUNCATE if possible. if (NumSrcElts == NumDstElts) return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src); if (NumSrcElts > NumDstElts) { MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits()); } if ((NumSrcElts * DstEltSizeInBits) >= 128) { MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, DstVT.getSizeInBits()); } // Non-VLX targets must truncate from a 512-bit type, so we need to // widen, truncate and then possibly extract the original subvector. if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) { SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512); return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers); } // Fallback to a X86ISD::VTRUNC, padding if necessary. MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits); SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src); if (DstVT != TruncVT) Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, DstVT.getSizeInBits()); return Trunc; } // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction. 
// // An example is the following: // // t0: ch = EntryToken // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0 // t25: v4i32 = truncate t2 // t41: v8i16 = bitcast t25 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16, // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0> // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21 // t18: v2i64 = bitcast t51 // // One can just use a single vpmovdw instruction, without avx512vl we need to // use the zmm variant and extract the lower subvector, padding with zeroes. // TODO: Merge with lowerShuffleAsVTRUNC. static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type"); if (!Subtarget.hasAVX512()) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned MaxScale = 64 / EltSizeInBits; for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { unsigned SrcEltBits = EltSizeInBits * Scale; unsigned NumSrcElts = NumElts / Scale; unsigned UpperElts = NumElts - NumSrcElts; if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) || !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) continue; // Attempt to find a matching source truncation, but as a fall back VLX // cases can use the VPMOV directly. SDValue Src = peekThroughBitcasts(V1); if (Src.getOpcode() == ISD::TRUNCATE && Src.getScalarValueSizeInBits() == SrcEltBits) { Src = Src.getOperand(0); } else if (Subtarget.hasVLX()) { MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); Src = DAG.getBitcast(SrcVT, Src); // Don't do this if PACKSS/PACKUS could perform it cheaper. 
if (Scale == 2 && ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) || (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits))) return SDValue(); } else return SDValue(); // VPMOVWB is only available with avx512bw. if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32) return SDValue(); bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts); return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); } return SDValue(); } // Attempt to match binary shuffle patterns as a truncate. static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.is128BitVector() || VT.is256BitVector()) && "Unexpected VTRUNC type"); if (!Subtarget.hasAVX512()) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned MaxScale = 64 / EltSizeInBits; for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { // TODO: Support non-BWI VPMOVWB truncations? unsigned SrcEltBits = EltSizeInBits * Scale; if (SrcEltBits < 32 && !Subtarget.hasBWI()) continue; // Match shuffle // Bail if the V2 elements are undef. unsigned NumHalfSrcElts = NumElts / Scale; unsigned NumSrcElts = 2 * NumHalfSrcElts; for (unsigned Offset = 0; Offset != Scale; ++Offset) { if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) || isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts)) continue; // The elements beyond the truncation must be undef/zero. unsigned UpperElts = NumElts - NumSrcElts; if (UpperElts > 0 && !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) continue; bool UndefUppers = UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts); // For offset truncations, ensure that the concat is cheap. 
if (Offset) { auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) { if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR && Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR) return Lo.getOperand(0) == Hi.getOperand(0); if (ISD::isNormalLoad(Lo.getNode()) && ISD::isNormalLoad(Hi.getNode())) { auto *LDLo = cast(Lo); auto *LDHi = cast(Hi); return DAG.areNonVolatileConsecutiveLoads( LDHi, LDLo, Lo.getValueType().getStoreSize(), 1); } return false; }; if (!IsCheapConcat(V1, V2)) continue; } // As we're using both sources then we need to concat them together // and truncate from the double-sized src. MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2); SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2); MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); Src = DAG.getBitcast(SrcVT, Src); // Shift the offset'd elements into place for the truncation. // TODO: Use getTargetVShiftByConstNode. if (Offset) Src = DAG.getNode( X86ISD::VSRLI, DL, SrcVT, Src, DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8)); return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); } } return SDValue(); } /// Check whether a compaction lowering can be done by dropping even/odd /// elements and compute how many times even/odd elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. 
Example shuffle masks: /// /// (even) /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// (odd) /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even/odd elements must be dropped /// if there is such a number. Otherwise returns zero. static int canLowerByDroppingElements(ArrayRef Mask, bool MatchEven, bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; int Offset = MatchEven ? 0 : 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] < 0) continue; bool IsAnyViable = false; for (unsigned j = 0; j != std::size(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. 
if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != std::size(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. // Checks for compaction shuffle masks if MaxStages > 1. // TODO: Add support for matching multiple PACKSS/PACKUS stages. static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages = 1) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 && "Illegal maximum compaction"); auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) { unsigned NumSrcBits = PackVT.getScalarSizeInBits(); unsigned NumPackedBits = NumSrcBits - BitSize; N1 = peekThroughBitcasts(N1); N2 = peekThroughBitcasts(N2); unsigned NumBits1 = N1.getScalarValueSizeInBits(); unsigned NumBits2 = N2.getScalarValueSizeInBits(); bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false); bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false); if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) || (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits)) return false; if (Subtarget.hasSSE41() || BitSize == 8) { APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits); if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) && (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) { V1 = N1; V2 = N2; SrcVT = PackVT; PackOpcode = X86ISD::PACKUS; return true; } } bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false); bool IsAllOnes2 = 
llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false); if ((N1.isUndef() || IsZero1 || IsAllOnes1 || DAG.ComputeNumSignBits(N1) > NumPackedBits) && (N2.isUndef() || IsZero2 || IsAllOnes2 || DAG.ComputeNumSignBits(N2) > NumPackedBits)) { V1 = N1; V2 = N2; SrcVT = PackVT; PackOpcode = X86ISD::PACKSS; return true; } return false; }; // Attempt to match against wider and wider compaction patterns. for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) { MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages); MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages); // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false, NumStages); if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2)) if (MatchPACK(V1, V2, PackVT)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true, NumStages); if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1)) if (MatchPACK(V1, V1, PackVT)) return true; } return false; } static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; unsigned SizeBits = VT.getSizeInBits(); unsigned EltBits = VT.getScalarSizeInBits(); unsigned MaxStages = Log2_32(64 / EltBits); if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, Subtarget, MaxStages)) return SDValue(); unsigned CurrentEltBits = PackVT.getScalarSizeInBits(); unsigned NumStages = Log2_32(CurrentEltBits / EltBits); // Don't lower multi-stage packs on AVX512, truncation is better. if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX()) return SDValue(); // Pack to the largest type possible: // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. unsigned MaxPackBits = 16; if (CurrentEltBits > 16 && (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41())) MaxPackBits = 32; // Repeatedly pack down to the target size. 
SDValue Res; for (unsigned i = 0; i != NumStages; ++i) { unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits); unsigned NumSrcElts = SizeBits / SrcEltBits; MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2); MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2); Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1), DAG.getBitcast(SrcVT, V2)); V1 = V2 = Res; CurrentEltBits /= 2; } assert(Res && Res.getValueType() == VT && "Failed to lower compaction shuffle"); return Res; } /// Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); SDValue Zero, AllOnes; // Use f64 if i64 isn't legal. if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { EltVT = MVT::f64; MaskVT = MVT::getVectorVT(EltVT, Mask.size()); } MVT LogicVT = VT; if (EltVT == MVT::f32 || EltVT == MVT::f64) { Zero = DAG.getConstantFP(0.0, DL, EltVT); APFloat AllOnesValue = APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT)); AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT); LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); } else { Zero = DAG.getConstant(0, DL, EltVT); AllOnes = DAG.getAllOnesConstant(DL, EltVT); } SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. 
VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); VMask = DAG.getBitcast(LogicVT, VMask); V = DAG.getBitcast(LogicVT, V); SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? 
AllOnes : Zero); } SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); return getBitSelect(DL, VT, V1, V2, V1Mask, DAG); } static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG); static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch"); // For 32/64-bit elements, if we only reference one input (plus any undefs), // then ensure the blend mask part for that lane just references that input. bool ForceWholeLaneMasks = VT.is256BitVector() && VT.getScalarSizeInBits() >= 32; // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. for (int Lane = 0; Lane != NumLanes; ++Lane) { // Keep track of the inputs used per lane. 
bool LaneV1InUse = false; bool LaneV2InUse = false; uint64_t LaneBlendMask = 0; for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) { int Elt = (Lane * NumEltsPerLane) + LaneElt; int M = Mask[Elt]; if (M == SM_SentinelUndef) continue; if (M == Elt || (0 <= M && M < NumElts && IsElementEquivalent(NumElts, V1, V1, M, Elt))) { Mask[Elt] = Elt; LaneV1InUse = true; continue; } if (M == (Elt + NumElts) || (NumElts <= M && IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) { LaneBlendMask |= 1ull << LaneElt; Mask[Elt] = Elt + NumElts; LaneV2InUse = true; continue; } if (Zeroable[Elt]) { if (V1IsZeroOrUndef) { ForceV1Zero = true; Mask[Elt] = Elt; LaneV1InUse = true; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; LaneBlendMask |= 1ull << LaneElt; Mask[Elt] = Elt + NumElts; LaneV2InUse = true; continue; } } return false; } // If we only used V2 then splat the lane blend mask to avoid any demanded // elts from V1 in this lane (the V1 equivalent is implicit with a zero // blend mask bit). if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse) LaneBlendMask = (1ull << NumEltsPerLane) - 1; BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane); } return true; } /// Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. 
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                   SDValue V2, ArrayRef<int> Original,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  // Work on a private copy of the mask: matchShuffleAsBlend rewrites it.
  SmallVector<int, 64> Mask(Original);
  if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
                           BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  unsigned NumElts = VT.getVectorNumElements();

  switch (VT.SimpleTy) {
  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    [[fallthrough]];
  case MVT::v4f64:
  case MVT::v8f32:
    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
    [[fallthrough]];
  case MVT::v2f64:
  case MVT::v2i64:
  case MVT::v4f32:
  case MVT::v4i32:
  case MVT::v8i16:
    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getTargetConstant(BlendMask, DL, MVT::i8));
  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getTargetConstant(BlendMask, DL, MVT::i8));
    }
    // Use PBLENDW for lower/upper lanes and then blend lanes.
    // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
    // merge to VSELECT where useful.
    uint64_t LoMask = BlendMask & 0xFF;
    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
      SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                               DAG.getTargetConstant(LoMask, DL, MVT::i8));
      SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                               DAG.getTargetConstant(HiMask, DL, MVT::i8));
      // Take the low 128-bit lane from Lo and the high 128-bit lane from Hi.
      return DAG.getVectorShuffle(
          MVT::v16i16, DL, Lo, Hi,
          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
    }
    [[fallthrough]];
  }
  case MVT::v32i8:
    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
    [[fallthrough]];
  case MVT::v16i8: {
    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                               Subtarget, DAG))
      return Masked;

    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      // The integer mask constant is at least 8 bits wide.
      MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // If we have VPTERNLOG, we can use that as a bit blend.
    if (Subtarget.hasVLX())
      if (SDValue BitBlend =
              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return BitBlend;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // x86 allows load folding with blendvb from the 2nd source operand. But
    // we are still using LLVM select here (see comment below), so that's V1.
    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
    // allow that load-folding possibility.
    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
      ShuffleVectorSDNode::commuteMask(Mask);
      std::swap(V1, V2);
    }

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0
                ? DAG.getUNDEF(MVT::i8)
                : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
    bool OptForSize = DAG.shouldOptForSize();
    if (!OptForSize) {
      if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                                 Subtarget, DAG))
        return Masked;
    }

    // Otherwise load an immediate into a GPR, cast to k-register, and use a
    // masked move.
    MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}

/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
///
/// \param Mask two-input shuffle mask; negative entries are treated as undef.
/// \param ImmBlends when true and the elements are 8 bits wide, additionally
///        require that the computed blend mask can be widened (checked with
///        canWidenShuffleElements).
/// \returns the blend+permute shuffle pair, or an empty SDValue when two mask
///          elements need different inputs at the same blend position.
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG,
                                             bool ImmBlends = false) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    // Position (Mask[i] % Size) of the blend can only hold one of the two
    // inputs; remember the first request and reject a conflicting one.
    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  // If only immediate blends, then bail if the blend mask can't be widened to
  // i16.
  unsigned EltSize = VT.getScalarSizeInBits();
  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
    return SDValue();

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}

/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
///
/// Even result positions select the first unpack operand and odd positions
/// the second (Ops[Elt & 1]); each operand must resolve to exactly one of
/// \p V1 or \p V2, and all referenced elements must come consistently from
/// either the low or the high half of a 128-bit lane.
///
/// \returns the UNPCKL/UNPCKH node followed by a single-input shuffle, or an
///          empty SDValue when the mask does not fit this pattern.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;
  int NumHalfLaneElts = NumLaneElts / 2;

  bool MatchLo = true, MatchHi = true;
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

  // Determine UNPCKL/UNPCKH type and operand order.
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    int M = Mask[Elt];
    if (M < 0)
      continue;

    // Normalize the mask value depending on whether it's V1 or V2.
    int NormM = M;
    SDValue &Op = Ops[Elt & 1];
    if (M < NumElts && (Op.isUndef() || Op == V1))
      Op = V1;
    else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
      Op = V2;
      NormM -= NumElts;
    } else
      return SDValue();

    // Find the lane holding NormM and record which half of it is referenced.
    bool MatchLoAnyLane = false, MatchHiAnyLane = false;
    for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
      MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
      MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
      if (MatchLoAnyLane || MatchHiAnyLane) {
        assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
               "Failed to match UNPCKLO/UNPCKHI");
        break;
      }
    }
    MatchLo &= MatchLoAnyLane;
    MatchHi &= MatchHiAnyLane;
    if (!MatchLo && !MatchHi)
      return SDValue();
  }
  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

  // Element indices have changed after unpacking. Calculate permute mask
  // so that they will be put back to the position as dictated by the
  // original shuffle mask indices.
  SmallVector<int, 32> PermuteMask(NumElts, -1);
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    int M = Mask[Elt];
    if (M < 0)
      continue;
    int NormM = M;
    if (NumElts <= M)
      NormM -= NumElts;
    bool IsFirstOp = M < NumElts;
    int BaseMaskElt =
        NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
    if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
      PermuteMask[Elt] = BaseMaskElt;
    else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
      PermuteMask[Elt] = BaseMaskElt + 1;
    assert(PermuteMask[Elt] != -1 &&
           "Input mask element is defined but failed to assign permute mask");
  }

  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}

/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
/// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); assert(Mask.size() >= 2 && "Single element masks are invalid."); // This routine only supports 128-bit integer dual input vectors. if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef()) return SDValue(); int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](int ScalarSize, int Scale) { SmallVector V1Mask((unsigned)Size, -1); SmallVector V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. 
if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) return Unpack; // If we're shuffling with a zero vector then we're better off not doing // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements. if (ISD::isBuildVectorAllZeros(V1.getNode()) || ISD::isBuildVectorAllZeros(V2.getNode())) return SDValue(); // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. // FIXME: We could consider commuting the unpacks. SmallVector PermMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 
0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. static SDValue lowerShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || (VT.is256BitVector() && !Subtarget.hasAVX2()) || (VT.is512BitVector() && !Subtarget.hasBWI())) return SDValue(); // We don't currently support lane crossing permutes. if (is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); int Scale = VT.getScalarSizeInBits() / 8; int NumLanes = VT.getSizeInBits() / 128; int NumElts = VT.getVectorNumElements(); int NumEltsPerLane = NumElts / NumLanes; // Determine range of mask elts. bool Blend1 = true; bool Blend2 = true; std::pair Range1 = std::make_pair(INT_MAX, INT_MIN); std::pair Range2 = std::make_pair(INT_MAX, INT_MIN); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) { Blend1 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range1.first = std::min(Range1.first, M); Range1.second = std::max(Range1.second, M); } else { M -= NumElts; Blend2 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range2.first = std::min(Range2.first, M); Range2.second = std::max(Range2.second, M); } } } // Bail if we don't need both elements. // TODO - it might be worth doing this for unary shuffles if the permute // can be widened. 
if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || !(0 <= Range2.first && Range2.second < NumEltsPerLane)) return SDValue(); if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) return SDValue(); // Rotate the 2 ops so we can access both ranges, then permute the result. auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); else PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); } } return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); }; // Check if the ranges are small enough to rotate from either direction. if (Range2.second < Range1.first) return RotateAndPermute(V1, V2, Range1.first, 0); if (Range1.second < Range2.first) return RotateAndPermute(V2, V1, Range2.first, NumElts); return SDValue(); } static bool isBroadcastShuffleMask(ArrayRef Mask) { return isUndefOrEqual(Mask, 0); } static bool isNoopOrBroadcastShuffleMask(ArrayRef Mask) { return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask); } /// Check if the Mask consists of the same element repeated multiple times. static bool isSingleElementRepeatedMask(ArrayRef Mask) { size_t NumUndefs = 0; std::optional UniqueElt; for (int Elt : Mask) { if (Elt == SM_SentinelUndef) { NumUndefs++; continue; } if (UniqueElt.has_value() && UniqueElt.value() != Elt) return false; UniqueElt = Elt; } // Make sure the element is repeated enough times by checking the number of // undefs is small. 
return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value(); } /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend. static SDValue lowerShuffleAsDecomposedShuffleMerge( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; // Shuffle the input elements into the desired positions in V1 and V2 and // unpack/blend them together. bool IsAlternating = true; SmallVector V1Mask(NumElts, -1); SmallVector V2Mask(NumElts, -1); SmallVector FinalMask(NumElts, -1); for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; if (M >= 0 && M < NumElts) { V1Mask[i] = M; FinalMask[i] = i; IsAlternating &= (i & 1) == 0; } else if (M >= NumElts) { V2Mask[i] = M - NumElts; FinalMask[i] = i + NumElts; IsAlternating &= (i & 1) == 1; } } // If we effectively only demand the 0'th element of \p Input, and not only // as 0'th element, then broadcast said input, // and change \p InputMask to be a no-op (identity) mask. 
auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget, &DAG](SDValue &Input, MutableArrayRef InputMask) { unsigned EltSizeInBits = Input.getScalarValueSizeInBits(); if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 || !X86::mayFoldLoad(Input, Subtarget))) return; if (isNoopShuffleMask(InputMask)) return; assert(isBroadcastShuffleMask(InputMask) && "Expected to demand only the 0'th element."); Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input); for (auto I : enumerate(InputMask)) { int &InputMaskElt = I.value(); if (InputMaskElt >= 0) InputMaskElt = I.index(); } }; // Currently, we may need to produce one shuffle per input, and blend results. // It is possible that the shuffle for one of the inputs is already a no-op. // See if we can simplify non-no-op shuffles into broadcasts, // which we consider to be strictly better than an arbitrary shuffle. if (isNoopOrBroadcastShuffleMask(V1Mask) && isNoopOrBroadcastShuffleMask(V2Mask)) { canonicalizeBroadcastableInput(V1, V1Mask); canonicalizeBroadcastableInput(V2, V2Mask); } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. We prefer to shuffle inputs as // the shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, a 2-input // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // Only prefer immediate blends to unpack/rotate. if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; // If either input vector provides only a single element which is repeated // multiple times, unpacking from both input vectors would generate worse // code. e.g. for // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4 // it is better to process t4 first to create a vector of t4[0], then unpack // that vector with t2. 
if (!isSingleElementRepeatedMask(V1Mask) && !isSingleElementRepeatedMask(V2Mask)) if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; if (VT.getScalarSizeInBits() >= 32) if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack( DL, VT, V1, V2, Mask, Subtarget, DAG)) return PermUnpack; } // If the final mask is an alternating blend of vXi8/vXi16, convert to an // UNPCKL(SHUFFLE, SHUFFLE) pattern. // TODO: It doesn't have to be alternating - but each lane mustn't have more // than half the elements coming from each source. if (IsAlternating && VT.getScalarSizeInBits() < 32) { V1Mask.assign(NumElts, -1); V2Mask.assign(NumElts, -1); FinalMask.assign(NumElts, -1); for (int i = 0; i != NumElts; i += NumEltsPerLane) for (int j = 0; j != NumEltsPerLane; ++j) { int M = Mask[i + j]; if (M >= 0 && M < NumElts) { V1Mask[i + (j / 2)] = M; FinalMask[i + j] = i + (j / 2); } else if (M >= NumElts) { V2Mask[i + (j / 2)] = M - NumElts; FinalMask[i + j] = i + (j / 2) + NumElts; } } } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); } static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits, const X86Subtarget &Subtarget, ArrayRef Mask) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers"); // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size. int MinSubElts = Subtarget.hasAVX512() ? 
std::max(32 / EltSizeInBits, 2) : 2; int MaxSubElts = 64 / EltSizeInBits; unsigned RotateAmt, NumSubElts; if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts, MaxSubElts, NumSubElts, RotateAmt)) return -1; unsigned NumElts = Mask.size(); MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); return RotateAmt; } /// Lower shuffle using X86ISD::VROTLI rotations. static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Only XOP + AVX512 targets have bit rotation instructions. // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this. bool IsLegal = (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512(); if (!IsLegal && Subtarget.hasSSE3()) return SDValue(); MVT RotateVT; int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(), Subtarget, Mask); if (RotateAmt < 0) return SDValue(); // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL, // expanded to OR(SRL,SHL), will be more efficient, but if they can // widen to vXi16 or more then existing lowering should will be better. if (!IsLegal) { if ((RotateAmt % 16) == 0) return SDValue(); // TODO: Use getTargetVShiftByConstNode. unsigned ShlAmt = RotateAmt; unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt; V1 = DAG.getBitcast(RotateVT, V1); SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1, DAG.getTargetConstant(ShlAmt, DL, MVT::i8)); SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1, DAG.getTargetConstant(SrlAmt, DL, MVT::i8)); SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL); return DAG.getBitcast(VT, Rot); } SDValue Rot = DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1), DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); return DAG.getBitcast(VT, Rot); } /// Try to match a vector shuffle as an element rotation. 
/// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && "Unexpected mask index."); if (M < 0) continue; // Determine where a rotated vector would have started. int StartIdx = i - (M % NumElts); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return -1; // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return -1; // Compute which value this mask is pointing at. SDValue MaskV = M < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return -1; } // Check that we successfully analyzed the mask, and normalize the results. 
assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; V1 = Lo; V2 = Hi; return Rotation; } /// Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (isAnyZero(Mask)) return -1; // PALIGNR works on 128-bit lanes. SmallVector RepeatedMask; if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; // PALIGNR rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. 
int NumElts = RepeatedMask.size(); int Scale = 16 / NumElts; return Rotation * Scale; } static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); // SSSE3 targets can use the palignr instruction. if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// Try to lower a vector shuffle as a dword/qword rotation. /// /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary /// rotation of the concatenation of two vectors; This routine will /// try to generically lower a vector shuffle through such an pattern. 
/// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask); if (0 < Rotation) return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, DAG.getTargetConstant(Rotation, DL, MVT::i8)); // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ. // TODO: Pull this out as a matchShuffleAsElementShift helper? // TODO: We can probably make this more aggressive and use shift-pairs like // lowerShuffleAsByteShiftMask. unsigned NumElts = Mask.size(); unsigned ZeroLo = Zeroable.countr_one(); unsigned ZeroHi = Zeroable.countl_one(); assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected"); if (!ZeroLo && !ZeroHi) return SDValue(); if (ZeroLo) { SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2; int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts; if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low)) return DAG.getNode(X86ISD::VALIGN, DL, VT, Src, getZeroVector(VT, Subtarget, DAG, DL), DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8)); } if (ZeroHi) { SDValue Src = Mask[0] < (int)NumElts ? V1 : V2; int Low = Mask[0] < (int)NumElts ? 
0 : NumElts; if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi)) return DAG.getNode(X86ISD::VALIGN, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), Src, DAG.getTargetConstant(ZeroHi, DL, MVT::i8)); } return SDValue(); } /// Try to lower a vector shuffle as a byte shift sequence. static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(VT.is128BitVector() && "Only 128-bit vectors supported"); // We need a shuffle that has zeros at one/both ends and a sequential // shuffle from one source within. unsigned ZeroLo = Zeroable.countr_one(); unsigned ZeroHi = Zeroable.countl_one(); if (!ZeroLo && !ZeroHi) return SDValue(); unsigned NumElts = Mask.size(); unsigned Len = NumElts - (ZeroLo + ZeroHi); if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo])) return SDValue(); unsigned Scale = VT.getScalarSizeInBits() / 8; ArrayRef StubMask = Mask.slice(ZeroLo, Len); if (!isUndefOrInRange(StubMask, 0, NumElts) && !isUndefOrInRange(StubMask, NumElts, 2 * NumElts)) return SDValue(); SDValue Res = Mask[ZeroLo] < (int)NumElts ? 
V1 : V2; Res = DAG.getBitcast(MVT::v16i8, Res); // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an // inner sequential set of elements, possibly offset: // 01234567 --> zzzzzz01 --> 1zzzzzzz // 01234567 --> 4567zzzz --> zzzzz456 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz if (ZeroLo == 0) { unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8)); } else if (ZeroHi == 0) { unsigned Shift = Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else if (!Subtarget.hasSSSE3()) { // If we don't have PSHUFB then its worth avoiding an AND constant mask // by performing 3 byte shifts. Shuffle combining can kick in above that. // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Shift += Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else return SDValue(); return DAG.getBitcast(VT, Res); } /// Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. 
It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) return -1; } int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? 
MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); return (int)ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) { int ShiftAmt = MatchShift(Shift, Scale, Left); if (0 < ShiftAmt) return ShiftAmt; } // no match return -1; } static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); MVT ShiftVT; SDValue V = V1; unsigned Opcode; // Try to match shuffle against V1 shift. int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. 
if (ShiftAmt < 0) { ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, Size, Zeroable, Subtarget); V = V2; } if (ShiftAmt < 0) return SDValue(); if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ)) return SDValue(); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. 
if (i > M || M >= HalfSize) return false; if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return false; } if (!Src || Idx < 0) return false; assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Src; return true; } // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. 
if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Base; V2 = Insert; return true; } } return false; } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return SDValue(); } /// Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. 
///
/// \p Scale is the number of source-width slots each extended element
/// occupies (Scale * element bits must not exceed 64) and \p Offset is the
/// index of the first source element to extend. When \p AnyExt is set the
/// high part of each extended element may be left undefined instead of
/// being zeroed.
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid a/zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = DAG.getBitcast(VT, InputV);
    InputV = ShuffleOffset(InputV);
    InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
                                    DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
  InputV = DAG.getBitcast(VT, InputV);

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

    // A single extract suffices when the upper result half is unused or the
    // next source element would fall outside the offset's lane.
    if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      if ((i % Scale == 0 && SafeOffset(Idx))) {
        PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
        continue;
      }
      // Remaining bytes are either don't-care (any-extend) or use the 0x80
      // selector for the zero-extend case.
      PSHUFBMask[i] =
          AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks. Each round doubles the element
  // width and halves the remaining scale.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}

/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
///
/// Extension scales are tried from the widest (extending to 64-bit elements)
/// down to a scale of 2. If none match, a 128-bit vector may still be
/// lowered with VZEXT_MOVL when its upper half is zeroable and its lower
/// half is taken in place from one input.
static SDValue lowerShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  // valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      // M was already reduced modulo NumElements above.
      if (M != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
                                                 InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}

/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); V = peekThroughBitcasts(V); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getBitcast(EltVT, S); } return SDValue(); } /// Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { return V->hasOneUse() && ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } template static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { T EltVT = VT.getScalarType(); return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16()); } /// Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. 
static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltBits = VT.getScalarSizeInBits(); if (isSoftF16(EltVT, Subtarget)) return SDValue(); int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr; bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Bail if a non-zero V1 isn't used in place. if (!IsV1Zeroable) { SmallVector V1Mask(Mask); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) { // Using zext to expand a narrow element won't work for non-zero // insertions. But we can use a masked constant vector if we're // inserting V2 into the bottom of V1. if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0)) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); // If we're inserting into a constant, mask off the inserted index // and OR with the zero-extended scalar. 
if (!IsV1Zeroable) { SmallVector Bits(NumElts, APInt::getAllOnes(EltBits)); Bits[V2Index] = APInt::getZero(EltBits); SDValue BitMask = getConstVector(Bits, VT, DAG, DL); V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask); V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2)); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD, MOVSS or MOVSH. unsigned MovOpc = 0; if (EltVT == MVT::f16) MovOpc = X86ISD::MOVSH; else if (EltVT == MVT::f32) MovOpc = X86ISD::MOVSS; else if (EltVT == MVT::f64) MovOpc = X86ISD::MOVSD; else llvm_unreachable("Unsupported floating point element type to handle!"); return DAG.getNode(MovOpc, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. 
if (VT.isFloatingPoint() || NumElts <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); MVT EltVT = VT.getVectorElementType(); MVT V0VT = V0.getSimpleValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); MVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. 
if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8)); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// If we are extracting two 128-bit halves of a vector and shuffling the /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a /// multi-shuffle lowering. 
///
/// Returns an empty SDValue unless \p N0 and \p N1 are single-use extracts of
/// the two halves of the same 256-bit vector and the mask is not already
/// cheap as a narrow shuffle.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
                                             SDValue N1, ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  MVT VT = N0.getSimpleValueType();
  assert((VT.is128BitVector() &&
          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

  // Check that both sources are extracts of the same source vector.
  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N0.getOperand(0) != N1.getOperand(0) ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue WideVec = N0.getOperand(0);
  MVT WideVT = WideVec.getSimpleValueType();
  if (!WideVT.is256BitVector())
    return SDValue();

  // Match extracts of each half of the wide source vector. Commute the shuffle
  // if the extract of the low half is N1.
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<int, 8> NewMask(Mask);
  const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
  const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
  if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
    ShuffleVectorSDNode::commuteMask(NewMask);
  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
    return SDValue();

  // Final bailout: if the mask is simple, we are better off using an extract
  // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
  // because that avoids a constant load from memory.
  if (NumElts == 4 &&
      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
    return SDValue();

  // Extend the shuffle mask with undef elements.
  NewMask.append(NumElts, -1);

  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
                                      NewMask);
  // This is free: ymm -> xmm.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
                     DAG.getIntPtrConstant(0, DL));
}

/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
///
/// The broadcast source is traced from \p V1 through bitcasts, concats and
/// subvector extracts/inserts while tracking the bit offset of the
/// broadcast element. \p V2 is not referenced by this routine. Returns an
/// empty SDValue if the mask is not a splat or the subtarget cannot
/// broadcast from the source that was found.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
                                       SDValue V2, ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT EltVT = VT.getVectorElementType();
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
        (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumEltBits = VT.getScalarSizeInBits();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = getSplatIndex(Mask);
  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  // TODO: Combine this logic with findEltLoadSrc() used by
  //       EltsFromConsecutiveLoads().
  int BitOffset = BroadcastIdx * NumEltBits;
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      V = V.getOperand(0);
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      // Step into whichever concatenated operand holds the broadcast bits.
      int OpBitWidth = V.getOperand(0).getValueSizeInBits();
      int OpIdx = BitOffset / OpBitWidth;
      V = V.getOperand(OpIdx);
      BitOffset %= OpBitWidth;
      continue;
    }
    case ISD::EXTRACT_SUBVECTOR: {
      // The extraction index adds to the existing offset.
      unsigned EltBitWidth = V.getScalarValueSizeInBits();
      unsigned Idx = V.getConstantOperandVal(1);
      unsigned BeginOffset = Idx * EltBitWidth;
      BitOffset += BeginOffset;
      V = V.getOperand(0);
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      // Follow the inserted subvector if it covers the broadcast bits,
      // otherwise keep following the outer vector.
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      int EltBitWidth = VOuter.getScalarValueSizeInBits();
      int Idx = (int)V.getConstantOperandVal(2);
      int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
      int BeginOffset = Idx * EltBitWidth;
      int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
      if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
        BitOffset -= BeginOffset;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }
  assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
  BroadcastIdx = BitOffset / NumEltBits;

  // Do we need to bitcast the source to retrieve the original broadcast index?
  bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // If the original value has a larger element type than the shuffle, the
  // broadcast element is in essence truncated. Make that explicit to ease
  // folding.
  if (BitCastSrc && VT.isInteger())
    if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
            DL, VT, V, BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  // Also check the simpler case, where we can directly reuse the scalar.
  if (!BitCastSrc &&
      ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (ISD::isNormalLoad(V.getNode()) &&
             cast<LoadSDNode>(V)->isSimple()) {
    // We do not check for one-use of the vector load because a broadcast load
    // is expected to be a win for code size, register pressure, and possibly
    // uops even if the original vector load is not eliminated.

    // Reduce the vector load and shuffle to a broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(V);
    SDValue BaseAddr = Ld->getOperand(1);
    MVT SVT = VT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
    SDValue NewAddr =
        DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);

    // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
    // than MOVDDUP.
    // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
    if (Opcode == X86ISD::VBROADCAST) {
      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {Ld->getChain(), NewAddr};
      V = DAG.getMemIntrinsicNode(
          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
          DAG.getMachineFunction().getMachineMemOperand(
              Ld->getMemOperand(), Offset, SVT.getStoreSize()));
      DAG.makeEquivalentMemoryOrdering(Ld, V);
      return DAG.getBitcast(VT, V);
    }
    assert(SVT == MVT::f64 && "Unexpected VT!");
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BitOffset != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    if ((BitOffset % 128) != 0)
      return SDValue();

    assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
           "Unexpected bit-offset");
    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
           "Unexpected vector size");
    unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
    V = extract128BitVector(V, ExtractIdx, DAG, DL);
  }

  // On AVX we can use VBROADCAST directly for scalar sources.
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
    V = DAG.getBitcast(MVT::f64, V);
    if (Subtarget.hasAVX()) {
      V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
      return DAG.getBitcast(VT, V);
    }
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
  }

  // If this is a scalar, do the broadcast on this type and bitcast.
  if (!V.getValueType().isVector()) {
    assert(V.getScalarValueSizeInBits() == NumEltBits &&
           "Unexpected scalar size");
    MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
                                       VT.getVectorNumElements());
    return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (V.getValueSizeInBits() > 128)
    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);

  // Otherwise cast V to a vector with the same element type as VT, but
  // possibly narrower than VT. Then perform the broadcast.
  unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
  MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
  return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}

// Check for whether we can use INSERTPS to perform the shuffle.
We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Attempt to match INSERTPS with one element from VA or VB being // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask // are updated. auto matchAsInsertPS = [&](SDValue VA, SDValue VB, ArrayRef CandidateMask) { unsigned ZMask = 0; int VADstIndex = -1; int VBDstIndex = -1; bool VAUsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any VA inputs in place. if (i == CandidateMask[i]) { VAUsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (VADstIndex >= 0 || VBDstIndex >= 0) return false; if (CandidateMask[i] < 4) { // VA input out of place for insertion. VADstIndex = i; } else { // VB input for insertion. VBDstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (VADstIndex < 0 && VBDstIndex < 0) return false; // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned VBSrcIndex = 0; if (VADstIndex >= 0) { // If we have a VA input out of place, we use VA as the V2 element // insertion and don't use the original V2 at all. 
VBSrcIndex = CandidateMask[VADstIndex]; VBDstIndex = VADstIndex; VB = VA; } else { VBSrcIndex = CandidateMask[VBDstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!VAUsedInPlace) VA = DAG.getUNDEF(MVT::v4f32); // Update V1, V2 and InsertPSMask accordingly. V1 = VA; V2 = VB; // Insert the V2 element into the desired position. InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); return true; }; if (matchAsInsertPS(V1, V2, Mask)) return true; // Commute and try again. SmallVector CommutedMask(Mask); ShuffleVectorSDNode::commuteMask(CommutedMask); if (matchAsInsertPS(V2, V1, CommutedMask)) return true; return false; } static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask = 0; if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. 
///
/// Both operands must be v2f64 and \p Mask must have exactly two elements.
/// When \p V2 is not undef, neither mask element may be undef, Mask[0] must
/// select from V1 (< 2) and Mask[1] from V2 (>= 2). This routine always
/// produces a node; SHUFP is the final fallback.
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    // Bit 0 selects the source element for lane 0, bit 1 for lane 1; undef
    // lanes (negative mask values) encode as 0.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use the immediate permute (VPERMILPI) which
      // will allow folding a load into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
    }

    // Replace an operand with undef when the lane it feeds is undef, so the
    // node does not depend on V1 more than necessary.
    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  // XOR with 2 moves an index to the same element of the other operand.
  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
      isShuffleEquivalent(Mask, {1, 3}, V1, V2))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          X86ISD::MOVSD, DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  // Mask[1] indexes the concatenated input, so rebase it onto V2 before
  // testing which of V2's elements it selects.
  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}

/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
///
/// Both operands must be v2i64 and \p Mask must have exactly two elements.
/// When \p V2 is not undef, neither mask element may be undef, Mask[0] must
/// select from V1 (< 2) and Mask[1] from V2 (>= 2).
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    // Each 64-bit element i becomes the 32-bit pair (2*i, 2*i+1); undef
    // elements stay undef in both halves.
    int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
                          Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
                          Mask[1] < 0 ? -1 : (Mask[1] * 2),
                          Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  // No undef check is needed here: both lanes were asserted defined above.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;
  }

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
                                                Subtarget, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}

/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
///
/// Mask elements >= 4 select from \p V2, elements in [0, 4) select from
/// \p V1, and negative elements are undef. Emits one X86ISD::SHUFP node, or
/// two when an intermediate blend is needed. A mask with three V2 elements is
/// commuted and handled by a recursive call.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  SmallVector<int, 4> NewMask(Mask);
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // a blend.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  } else if (NumV2Elements == 3) {
    // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
    // we can get here due to other paths (e.g repeated mask matching) that we
    // don't want to do another round of lowerVECTOR_SHUFFLE.
    ShuffleVectorSDNode::commuteMask(NewMask);
    return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}

/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
///
/// Both operands must be v4f32 and \p Mask must have exactly four elements.
/// This routine always produces a node; SHUFPS is the final fallback.
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
    // in SSE1 because otherwise they are widened to v2f64 and never get here.
    if (!Subtarget.hasSSE2()) {
      if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
      if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  if (Subtarget.hasSSE2())
    if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
      // The extend is requested as v4i32, so cast the result back to v4f32.
      ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
      return ZExt;
    }

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
                                                            V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions. These are only valid in SSE1 because
  // otherwise they are widened to v2f64 and never get here.
  if (!Subtarget.hasSSE2()) {
    if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
    if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}

/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
///
/// Both operands must be v4i32 and \p Mask must have exactly four elements.
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  // Try to use shift instructions if fast.
  if (Subtarget.preferLowerShuffleAsShift()) {
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;
    if (NumV2Elements == 0)
      if (SDValue Rotate =
              lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
        return Rotate;
  }

  if (NumV2Elements == 0) {
    // Try to use broadcast unless the mask only has one non-undef element.
    if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
      if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Broadcast;
    }

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  if (Subtarget.hasAVX2())
    if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
      return Extract;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing into
    // a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
                                                  Subtarget, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
                                                        Mask, Subtarget, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}

/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  // Attempt to directly match PSHUFLW or PSHUFHW.
if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); } if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { for (int i = 0; i != 4; ++i) HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4)); return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); } SmallVector LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(llvm::unique(LoInputs), LoInputs.end()); SmallVector HiInputs; copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(llvm::unique(HiInputs), HiInputs.end()); int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // If we are shuffling values from one half - check how many different DWORD // pairs we need to create. If only 1 or 2 then we can perform this as a // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. 
auto ShuffleDWordPairs = [&](ArrayRef PSHUFHalfMask, ArrayRef PSHUFDMask, unsigned ShufWOp) { V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); return DAG.getBitcast(VT, V); }; if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { int PSHUFDMask[4] = { -1, -1, -1, -1 }; SmallVector, 4> DWordPairs; int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); // Collect the different DWORD pairs. for (int DWord = 0; DWord != 4; ++DWord) { int M0 = Mask[2 * DWord + 0]; int M1 = Mask[2 * DWord + 1]; M0 = (M0 >= 0 ? M0 % 4 : M0); M1 = (M1 >= 0 ? M1 % 4 : M1); if (M0 < 0 && M1 < 0) continue; bool Match = false; for (int j = 0, e = DWordPairs.size(); j < e; ++j) { auto &DWordPair = DWordPairs[j]; if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second); PSHUFDMask[DWord] = DOffset + j; Match = true; break; } } if (!Match) { PSHUFDMask[DWord] = DOffset + DWordPairs.size(); DWordPairs.push_back(std::make_pair(M0, M1)); } } if (DWordPairs.size() <= 2) { DWordPairs.resize(2, std::make_pair(-1, -1)); int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, DWordPairs[1].first, DWordPairs[1].second}; if ((NumHToL + NumHToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); if ((NumLToL + NumLToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); } } // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. 
For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. 
auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord = 0, BDWord = 0; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. 
We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) + llvm::count(AToBInputs, 2 * ADWord + 1); int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) + llvm::count(BToBInputs, 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode( FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) M = FixFreeIdx; else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? 
TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. 
auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. 
auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. 
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. 
if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] < 0) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] < 0) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. 
for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. 
if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. static SDValue lowerShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && "Lane crossing shuffle masks not supported"); int NumBytes = VT.getSizeInBits() / 8; int Size = Mask.size(); int Scale = NumBytes / Size; SmallVector V1Mask(NumBytes, DAG.getUNDEF(MVT::i8)); SmallVector V2Mask(NumBytes, DAG.getUNDEF(MVT::i8)); V1InUse = false; V2InUse = false; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Scale]; if (M < 0) continue; const int ZeroMask = 0x80; int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask; int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes); if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1), DAG.getBuildVector(ShufVT, DL, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2), DAG.getBuildVector(ShufVT, DL, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. 
return DAG.getBitcast(VT, V); } /// Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Try to use lower using a truncation. if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use bit rotation instructions. 
if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. SmallVector MutableMask(Mask); return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask, Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. 
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use lower using a truncation. if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; // Try to use byte shift instructions to mask. if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW. int NumEvenDrops = canLowerByDroppingElements(Mask, true, false); if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) && !Subtarget.hasVLX()) { // Check if this is part of a 256-bit vector truncation. 
unsigned PackOpc = 0; if (NumEvenDrops == 2 && Subtarget.hasAVX2() && peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR && peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) { SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL); V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2, getZeroVector(MVT::v16i16, Subtarget, DAG, DL), DAG.getTargetConstant(0xEE, DL, MVT::i8)); V1V2 = DAG.getBitcast(MVT::v8i32, V1V2); V1 = extract128BitVector(V1V2, 0, DAG, DL); V2 = extract128BitVector(V1V2, 4, DAG, DL); PackOpc = X86ISD::PACKUS; } else if (Subtarget.hasSSE41()) { SmallVector DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32)); for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1)) DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32); SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps); V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1), DWordClearMask); V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2), DWordClearMask); PackOpc = X86ISD::PACKUS; } else if (!Subtarget.hasSSSE3()) { SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8); V1 = DAG.getBitcast(MVT::v4i32, V1); V2 = DAG.getBitcast(MVT::v4i32, V2); V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt); V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt); V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt); V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt); PackOpc = X86ISD::PACKSS; } if (PackOpc) { // Now pack things back together. SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2); if (NumEvenDrops == 2) { Result = DAG.getBitcast(MVT::v4i32, Result); Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result); } return Result; } } // When compacting odd (upper) elements, use PACKSS pre-SSE41. int NumOddDrops = canLowerByDroppingElements(Mask, false, false); if (NumOddDrops == 1) { bool HasSSE41 = Subtarget.hasSSE41(); V1 = DAG.getNode(HasSSE41 ? 
X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1), DAG.getTargetConstant(16, DL, MVT::i8)); V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2), DAG.getTargetConstant(16, DL, MVT::i8)); return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL, MVT::v8i16, V1, V2); } // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends/unpacks. return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG); } /// Lower 8-lane 16-bit floating point shuffles. static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; }); if (Subtarget.hasFP16()) { if (NumV2Elements == 0) { // Check for being able to broadcast a single element. 
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; } if (NumV2Elements == 1 && Mask[0] >= 8) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; } V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(MVT::v8f16, DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask)); } // Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets, // sub-512-bit shuffles are padded to 512-bits for the shuffle and then // the active subvector is extracted. static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT MaskVT = VT.changeTypeToInteger(); SDValue MaskNode; MVT ShuffleVT = VT; if (!VT.is512BitVector() && !Subtarget.hasVLX()) { V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512); V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512); ShuffleVT = V1.getSimpleValueType(); // Adjust mask to correct indices for the second input. int NumElts = VT.getVectorNumElements(); unsigned Scale = 512 / VT.getSizeInBits(); SmallVector AdjustedMask(Mask); for (int &M : AdjustedMask) if (NumElts <= M) M += (Scale - 1) * NumElts; MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true); MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512); } else { MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true); } SDValue Result; if (V2.isUndef()) Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1); else Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2); if (VT != ShuffleVT) Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits()); return Result; } /// Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. 
If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use a zext lowering. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Try to use lower using a truncation. if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. 
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use bit rotation instructions. if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; copy_if(Mask, std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(llvm::unique(LoInputs), LoInputs.end()); SmallVector HiInputs; copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(llvm::unique(HiInputs), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. 
This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] >= 0) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. bool EvenInUse = false, OddInUse = false; for (int i = 0; i < 16; i += 2) { EvenInUse |= (Mask[i + 0] >= 0); OddInUse |= (Mask[i + 1] >= 0); if (EvenInUse && OddInUse) break; } V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8), OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8)); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] < 0) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. 
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Try to use byte shift instructions to mask. if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // Check for compaction patterns. bool IsSingleInput = V2.isUndef(); int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput); // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // If the mask is a binary compaction, we can more efficiently perform this // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()). // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) { bool V1InUse = false; bool V2InUse = false; SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. 
if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some // cases. Even though the or may be (very minorly) more efficient, we // preference this lowering because there are common cases where part of // the complexity of the shuffles goes away when we do the final blend as // an unpack. // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). if (Subtarget.hasVBMI()) return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget, DAG); // If we have XOP we can use one VPPERM instead of multiple PSHUFBs. if (Subtarget.hasXOP()) { SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true); return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode); } // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. if (SDValue V = lowerShuffleAsByteRotateAndPermute( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; } return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) return Blend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. 
// // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. if (NumEvenDrops) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); SmallVector WordClearOps(8, DAG.getConstant(0, DL, MVT::i16)); for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1)) WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16); SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps); V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1), WordClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2), WordClearMask); // Now pack things back together. SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, IsSingleInput ? V1 : V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } return Result; } int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput); if (NumOddDrops == 1) { V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1), DAG.getTargetConstant(8, DL, MVT::i8)); if (!IsSingleInput) V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2), DAG.getTargetConstant(8, DL, MVT::i8)); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, IsSingleInput ? V1 : V2); } // Handle multi-input cases by blending/unpacking single-input shuffles. 
if (NumV2Elements > 0) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together // with a pack. SDValue V = V1; std::array LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; std::array HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; SDValue VLoHalf, VHiHalf; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) && none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, DAG.getConstant(0x00FF, DL, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke VHiHalf. VHiHalf = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into VLoHalf. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); VLoHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); VHiHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// Dispatching routine to lower various 128-bit x86 vector shuffles. 
/// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (VT == MVT::v8bf16) { V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f16: return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. 
///
/// When \p SimpleOnly is set, an empty SDValue is returned if either half of
/// the result would need elements from the upper half of \p V1 or \p V2.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                    SDValue V2, ArrayRef<int> Mask,
                                    SelectionDAG &DAG, bool SimpleOnly) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);

  // Use splitVector/extractSubVector so that split build-vectors just build two
  // narrower build vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    SDValue LoV, HiV;
    std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.

  // Records which of the four half-width inputs (lo/hi of V1, lo/hi of V2) the
  // given half of the mask reads from. All four flags are reset first.
  auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
                                   bool &UseHiV1, bool &UseLoV2,
                                   bool &UseHiV2) {
    UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
      }
    }
  };

  // In SimpleOnly mode a half is usable only if it reads neither upper half.
  auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
    if (!SimpleOnly)
      return true;

    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);

    return !(UseHiV1 || UseHiV2);
  };

  // Builds the half-width result for one half of the mask.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    // V1BlendMask/V2BlendMask shuffle within the two halves of one input;
    // BlendMask then selects, per element, between those two results.
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.
    assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };

  if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
    return SDValue();

  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}

/// Either split a vector in halves or decompose the shuffles and the
/// blend/unpack.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
                          "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    // True when every defined mask element taken from V1 names the same V1
    // index and every one taken from V2 names the same V2 index.
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
                                                DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  // LaneInputs[0] / LaneInputs[1] mark which lanes of V1 / V2 are referenced.
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
                                /*SimpleOnly*/ false);

  // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
  // requires that the decomposed single-input shuffles don't end up here.
  return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
                                              DAG);
}

// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
// TODO: Extend to support v8f32 (+ 512-bit shuffles).
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT == MVT::v4f64 && "Only for v4f64 shuffles"); int LHSMask[4] = {-1, -1, -1, -1}; int RHSMask[4] = {-1, -1, -1, -1}; unsigned SHUFPMask = 0; // As SHUFPD uses a single LHS/RHS element per lane, we can always // perform the shuffle once the lanes have been shuffled in place. for (int i = 0; i != 4; ++i) { int M = Mask[i]; if (M < 0) continue; int LaneBase = i & ~1; auto &LaneMask = (i & 1) ? RHSMask : LHSMask; LaneMask[LaneBase + (M & 1)] = M; SHUFPMask |= (M & 1) << i; } SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask); SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask); return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS, DAG.getTargetConstant(SHUFPMask, DL, MVT::i8)); } /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a lane permutation followed by a per-lane permutation. /// /// This is mainly for cases where we can have non-repeating permutes /// in each lane. /// /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask, /// we should investigate merging them. static SDValue lowerShuffleAsLanePermuteAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef(); /// Attempts to find a sublane permute with the given size /// that gets all elements into their target lanes. /// /// If successful, fills CrossLaneMask and InLaneMask and returns true. /// If unsuccessful, returns false and may overwrite InLaneMask. 
auto getSublanePermute = [&](int NumSublanes) -> SDValue { int NumSublanesPerLane = NumSublanes / NumLanes; int NumEltsPerSublane = NumElts / NumSublanes; SmallVector CrossLaneMask; SmallVector InLaneMask(NumElts, SM_SentinelUndef); // CrossLaneMask but one entry == one sublane. SmallVector CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M < 0) continue; int SrcSublane = M / NumEltsPerSublane; int DstLane = i / NumEltsPerLane; // We only need to get the elements into the right lane, not sublane. // So search all sublanes that make up the destination lane. bool Found = false; int DstSubStart = DstLane * NumSublanesPerLane; int DstSubEnd = DstSubStart + NumSublanesPerLane; for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) { if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane)) continue; Found = true; CrossLaneMaskLarge[DstSublane] = SrcSublane; int DstSublaneOffset = DstSublane * NumEltsPerSublane; InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane; break; } if (!Found) return SDValue(); } // Fill CrossLaneMask using CrossLaneMaskLarge. narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask); if (!CanUseSublanes) { // If we're only shuffling a single lowest lane and the rest are identity // then don't bother. // TODO - isShuffleMaskInputInPlace could be extended to something like // this. int NumIdentityLanes = 0; bool OnlyShuffleLowestLane = true; for (int i = 0; i != NumLanes; ++i) { int LaneOffset = i * NumEltsPerLane; if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane, i * NumEltsPerLane)) NumIdentityLanes++; else if (CrossLaneMask[LaneOffset] != 0) OnlyShuffleLowestLane = false; } if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) return SDValue(); } // Avoid returning the same shuffle operation. 
For example, // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, // undef:v16i16 if (CrossLaneMask == Mask || InLaneMask == Mask) return SDValue(); SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), InLaneMask); }; // First attempt a solution with full lanes. if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes)) return V; // The rest of the solutions use sublanes. if (!CanUseSublanes) return SDValue(); // Then attempt a solution with 64-bit sublanes (vpermq). if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2)) return V; // If that doesn't work and we have fast variable cross-lane shuffle, // attempt 32-bit sublanes (vpermd). if (!Subtarget.hasFastVariableCrossLaneShuffle()) return SDValue(); return getSublanePermute(/*NumSublanes=*/NumLanes * 4); } /// Helper to get compute inlane shuffle mask for a complete shuffle mask. static void computeInLaneShuffleMask(const ArrayRef &Mask, int LaneSize, SmallVector &InLaneMask) { int Size = Mask.size(); InLaneMask.assign(Mask.begin(), Mask.end()); for (int i = 0; i < Size; ++i) { int &M = InLaneMask[i]; if (M < 0) continue; if (((M % Size) / LaneSize) != (i / LaneSize)) M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size; } } /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one /// source with a lane permutation. /// /// This lowering strategy results in four instructions in the worst case for a /// single-input cross lane shuffle which is lower than any other fully general /// cross-lane shuffle strategy I'm aware of. Special cases for each particular /// shuffle pattern should be handled prior to trying this lowering. static SDValue lowerShuffleAsLanePermuteAndShuffle( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. 
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); int LaneSize = Size / 2; // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). // Only do this if the elements aren't all from the lower lane, // otherwise we're (probably) better off doing a split. if (VT == MVT::v4f64 && !all_of(Mask, [LaneSize](int M) { return M < LaneSize; })) return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG); // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. bool AllLanes; if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize)) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; AllLanes = LaneCrossing[0] && LaneCrossing[1]; } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] % Size) / LaneSize] = true; AllLanes = LaneUsed[0] && LaneUsed[1]; } // TODO - we could support shuffling V2 in the Flipped input. assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); SmallVector InLaneMask; computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask); assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && "In-lane shuffle mask expected"); // If we're not using both lanes in each lane and the inlane mask is not // repeating, then we're better off splitting. if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask)) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); // Flip the lanes, and shuffle the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? 
MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1}); Flipped = DAG.getBitcast(VT, Flipped); return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask); } /// Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (V2.isUndef()) { // Attempt to match VBROADCAST*128 subvector broadcast load. bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1); bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1); if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() && X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) { MVT MemVT = VT.getHalfNumVectorElementsVT(); unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize(); auto *Ld = cast(peekThroughOneUseBitcasts(V1)); if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT, Ld, Ofs, DAG)) return BcstLd; } // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2()) return SDValue(); } bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode()); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask)) return SDValue(); bool IsLowZero = (Zeroable & 0x3) == 0x3; bool IsHighZero = (Zeroable & 0xc) == 0xc; // Try to use an insert into a zero vector. 
if (WidenedMask[0] == 0 && IsHighZero) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, DAG.getIntPtrConstant(0, DL)); } // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2); if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) { // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. if (!isa(peekThroughBitcasts(V1))) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(2, DL)); } } // Try to use SHUF128 if possible. if (Subtarget.hasVLX()) { if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } // Otherwise form a 128-bit permutation. 
After accounting for undefs, // convert the 64-bit shuffle mask selection values into 128-bit // selection bits by dividing the indexes by 2 and shifting into positions // defined by a vperm2*128 instruction's immediate control byte. // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination // [2] - ignore // [3] - zero low half of destination // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination assert((WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?"); unsigned PermMask = 0; PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // Check the immediate mask and replace unused sources with undef. if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) V1 = DAG.getUNDEF(VT); if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This attempts to create a repeated lane shuffle where each lane uses one /// or two of the lanes of the inputs. The lanes of the input vectors are /// shuffled in one or two independent shuffles to get the lanes into the /// position needed by the final shuffle. 
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
    return SDValue();

  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = 128 / VT.getScalarSizeInBits();
  // RepeatMask is the per-lane mask shared by every lane of the final shuffle.
  // LaneSrcs[Lane] holds the (up to two) source lanes feeding result lane
  // Lane, indexed across V1's lanes followed by V2's lanes.
  SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
  SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});

  // First pass will try to fill in the RepeatMask from lanes that need two
  // sources.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Srcs[2] = {-1, -1};
    SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;
      // Determine which of the possible input lanes (NumLanes from each source)
      // this element comes from. Assign that as one of the sources for this
      // lane. We can assign up to 2 sources for this lane. If we run out of
      // sources we can't do anything.
      int LaneSrc = M / NumLaneElts;
      int Src;
      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
        Src = 0;
      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
        Src = 1;
      else
        return SDValue();

      Srcs[Src] = LaneSrc;
      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
    }

    // If this lane has two sources, see if it fits with the repeat mask so far.
    if (Srcs[1] < 0)
      continue;

    LaneSrcs[Lane][0] = Srcs[0];
    LaneSrcs[Lane][1] = Srcs[1];

    // Two masks match when they agree wherever both are defined.
    auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
      assert(M1.size() == M2.size() && "Unexpected mask size");
      for (int i = 0, e = M1.size(); i != e; ++i)
        if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
          return false;
      return true;
    };

    // Copies every defined element of Mask into MergedMask.
    auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
      assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
      for (int i = 0, e = MergedMask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue;
        assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
               "Unexpected mask element");
        MergedMask[i] = M;
      }
    };

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Didn't find a match. Swap the operands and try again.
    std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
    ShuffleVectorSDNode::commuteMask(InLaneMask);

    if (MatchMasks(InLaneMask, RepeatMask)) {
      // Merge this lane mask into the final repeat mask.
      MergeMasks(InLaneMask, RepeatMask);
      continue;
    }

    // Couldn't find a match with the operands in either order.
    return SDValue();
  }

  // Now handle any lanes with only one source.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    // If this lane has already been processed, skip it.
    if (LaneSrcs[Lane][0] >= 0)
      continue;

    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (M < 0)
        continue;

      // If RepeatMask isn't defined yet we can define it ourselves.
      if (RepeatMask[i] < 0)
        RepeatMask[i] = M % NumLaneElts;

      // A RepeatMask entry below NumElts selects from the first permuted
      // input, otherwise from the second; record this lane's source on the
      // matching side.
      if (RepeatMask[i] < NumElts) {
        if (RepeatMask[i] != M % NumLaneElts)
          return SDValue();
        LaneSrcs[Lane][0] = M / NumLaneElts;
      } else {
        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
          return SDValue();
        LaneSrcs[Lane][1] = M / NumLaneElts;
      }
    }

    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
      return SDValue();
  }

  // Build the first lane permute from the first source of every lane.
  SmallVector<int, 16> NewMask(NumElts, -1);
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][0];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  //        getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV1) &&
      cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
    return SDValue();

  // Build the second lane permute from the second source of every lane.
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][1];
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = -1;
      if (Src >= 0)
        M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;
    }
  }
  SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  // Ensure we didn't get back the shuffle we started with.
  // FIXME: This is a hack to make up for some splat handling code in
  //        getVectorShuffle.
  if (isa<ShuffleVectorSDNode>(NewV2) &&
      cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
    return SDValue();

  // Finally apply RepeatMask to every lane of the two permuted inputs.
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0) {
      NewMask[i] = -1;
      continue;
    }
    NewMask[i] = RepeatMask[i % NumLaneElts];
    if (NewMask[i] < 0)
      continue;

    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
  }
  return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}

/// If the input shuffle mask results in a vector that is undefined in all upper
/// or lower half elements and that mask accesses only 2 halves of the
/// shuffle's operands, return true. A mask of half the width with mask indexes
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask.
HalfIdx1 and HalfIdx2 return whether the upper or /// lower half of each input operand is accessed. static bool getHalfShuffleMask(ArrayRef Mask, MutableArrayRef HalfMask, int &HalfIdx1, int &HalfIdx2) { assert((Mask.size() == HalfMask.size() * 2) && "Expected input mask to be twice as long as output"); // Exactly one half of the result must be undef to allow narrowing. bool UndefLower = isUndefLowerHalf(Mask); bool UndefUpper = isUndefUpperHalf(Mask); if (UndefLower == UndefUpper) return false; unsigned HalfNumElts = HalfMask.size(); unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0; HalfIdx1 = -1; HalfIdx2 = -1; for (unsigned i = 0; i != HalfNumElts; ++i) { int M = Mask[i + MaskIndexOffset]; if (M < 0) { HalfMask[i] = M; continue; } // Determine which of the 4 half vectors this element is from. // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. int HalfIdx = M / HalfNumElts; // Determine the element index into its half vector source. int HalfElt = M % HalfNumElts; // We can shuffle with up to 2 half vectors, set the new 'half' // shuffle mask accordingly. if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { HalfMask[i] = HalfElt; HalfIdx1 = HalfIdx; continue; } if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { HalfMask[i] = HalfElt + HalfNumElts; HalfIdx2 = HalfIdx; continue; } // Too many half vectors referenced. return false; } return true; } /// Given the output values from getHalfShuffleMask(), create a half width /// shuffle of extracted vectors followed by an insert back to full width. 
///
/// With \p UseConcat the half-width result is combined with an undef half via
/// CONCAT_VECTORS; otherwise it is inserted into an undef vector with
/// INSERT_SUBVECTOR. In both cases \p UndefLower places it in the upper half.
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
                                     ArrayRef<int> HalfMask, int HalfIdx1,
                                     int HalfIdx2, bool UndefLower,
                                     SelectionDAG &DAG, bool UseConcat = false) {
  assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
  assert(V1.getValueType().isSimple() && "Expecting only simple types");

  MVT VT = V1.getSimpleValueType();
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();

  // HalfIdx encodes 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2;
  // a negative index yields an undef half.
  auto getHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
  SDValue Half1 = getHalfVector(HalfIdx1);
  SDValue Half2 = getHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  if (UseConcat) {
    SDValue Op0 = V;
    SDValue Op1 = DAG.getUNDEF(HalfVT);
    if (UndefLower)
      std::swap(Op0, Op1);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
  }

  unsigned Offset = UndefLower ? HalfNumElts : 0;
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}

/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef Mask,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  // NOTE(review): template argument lists (e.g. on ArrayRef / SmallVector)
  // look like they were stripped from this copy of the file - confirm against
  // upstream.
  assert((VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected 256-bit or 512-bit vector");

  // Bail out (empty SDValue) unless one half of the mask is reported undef.
  bool UndefLower = isUndefLowerHalf(Mask);
  if (!UndefLower && !isUndefUpperHalf(Mask))
    return SDValue();

  assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
         "Completely undef shuffle mask should have been simplified already");

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  unsigned HalfNumElts = HalfVT.getVectorNumElements();
  if (!UndefLower &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    // Note: despite its name, Hi holds the low half of V1 here; it is inserted
    // at element HalfNumElts of the result.
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // Ask getHalfShuffleMask for a half-width mask and the (up to two) source
  // halves it reads from; give up if it reports failure.
  int HalfIdx1, HalfIdx2;
  SmallVector HalfMask(HalfNumElts);
  if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
    return SDValue();

  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  // Half indices 0/2 count as lower halves and 1/3 as upper halves.
  unsigned NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  unsigned NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");

  // Determine the larger pattern of undef/halves, then decide if it's worth
  // splitting the shuffle based on subtarget capabilities and types.
  unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
  if (!UndefLower) {
    // XXXXuuuu: no insert is needed.
    // Always extract lowers when setting lower - these are all free subreg ops.
    if (NumUpperHalves == 0)
      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                   UndefLower, DAG);

    if (NumUpperHalves == 1) {
      // AVX2 has efficient 32/64-bit element cross-lane shuffles.
      if (Subtarget.hasAVX2()) {
        // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
        if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
            !is128BitUnpackShuffleMask(HalfMask, DAG) &&
            (!isSingleSHUFPSMask(HalfMask) ||
             Subtarget.hasFastVariableCrossLaneShuffle()))
          return SDValue();
        // If this is a unary shuffle (assume that the 2nd operand is
        // canonicalized to undef), then we can use vpermpd. Otherwise, we
        // are better off extracting the upper half of 1 operand and using a
        // narrow shuffle.
        if (EltWidth == 64 && V2.isUndef())
          return SDValue();
      }
      // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
      if (Subtarget.hasAVX512() && VT.is512BitVector())
        return SDValue();
      // Extract + narrow shuffle is better than the wide alternative.
      return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                   UndefLower, DAG);
    }

    // Don't extract both uppers, instead shuffle and then extract.
    assert(NumUpperHalves == 2 && "Half vector count went wrong");
    return SDValue();
  }

  // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
  if (NumUpperHalves == 0) {
    // AVX2 has efficient 64-bit element cross-lane shuffles.
    // TODO: Refine to account for unary shuffle, splat, and other masks?
    if (Subtarget.hasAVX2() && EltWidth == 64)
      return SDValue();
    // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
    if (Subtarget.hasAVX512() && VT.is512BitVector())
      return SDValue();
    // Narrow shuffle + insert is better than the wide alternative.
    return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
                                 UndefLower, DAG);
  }

  // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
  return SDValue();
}

/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
///
/// Returns an empty SDValue when no such decomposition is found, or when the
/// decomposition would just reproduce the input shuffle.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      // Only broadcast units wider than a single element are interesting.
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only references the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            // Reject elements sourced from outside lane 0 of either input.
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            // Reject a conflict with what another group already put in slot j.
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;

      // Avoid returning the same shuffle operation. For example,
      // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
      if (BroadcastMask == Mask)
        return SDValue();

      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  if (is128BitLaneRepeatedShuffleMask(VT, Mask))
    return SDValue();

  // Helper to look for repeated mask in each split sublane, and that those
  // sublanes can then be permuted into place.
  // SubLaneScale is the number of sub-lanes each 128-bit lane is split into.
  auto ShuffleSubLanes = [&](int SubLaneScale) {
    int NumSubLanes = NumLanes * SubLaneScale;
    int NumSubLaneElts = NumLaneElts / SubLaneScale;

    // Check that all the sources are coming from the same lane and see if we
    // can form a repeating shuffle mask (local to each sub-lane). At the same
    // time, determine the source sub-lane for each destination sub-lane.
    int TopSrcSubLane = -1;
    SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
    SmallVector> RepeatedSubLaneMasks(
        SubLaneScale,
        SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef));

    for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
      // Extract the sub-lane mask, check that it all comes from the same lane
      // and normalize the mask entries to come from the first lane.
      int SrcLane = -1;
      SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1);
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
        if (M < 0)
          continue;
        int Lane = (M % NumElts) / NumLaneElts;
        if ((0 <= SrcLane) && (SrcLane != Lane))
          return SDValue();
        SrcLane = Lane;
        // Keep the in-lane offset; indices >= NumElts (second operand) keep
        // their NumElts bias.
        int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
        SubLaneMask[Elt] = LocalM;
      }

      // Whole sub-lane is UNDEF.
      if (SrcLane < 0)
        continue;

      // Attempt to match against the candidate repeated sub-lane masks.
      for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
        // Two masks match when every position defined in both agrees.
        auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) {
          for (int i = 0; i != NumSubLaneElts; ++i) {
            if (M1[i] < 0 || M2[i] < 0)
              continue;
            if (M1[i] != M2[i])
              return false;
          }
          return true;
        };

        auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
        if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
          continue;

        // Merge the sub-lane mask into the matching repeated sub-lane mask.
        for (int i = 0; i != NumSubLaneElts; ++i) {
          int M = SubLaneMask[i];
          if (M < 0)
            continue;
          assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
                 "Unexpected mask element");
          RepeatedSubLaneMask[i] = M;
        }

        // Track the top most source sub-lane - by setting the remaining to
        // UNDEF we can greatly simplify shuffle matching.
        int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
        TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
        Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
        break;
      }

      // Bail if we failed to find a matching repeated sub-lane mask.
      if (Dst2SrcSubLanes[DstSubLane] < 0)
        return SDValue();
    }
    assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
           "Unexpected source lane");

    // Create a repeating shuffle mask for the entire vector.
    // Sub-lanes above TopSrcSubLane are left undef (-1).
    SmallVector RepeatedMask((unsigned)NumElts, -1);
    for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
      int Lane = SubLane / SubLaneScale;
      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = RepeatedSubLaneMask[Elt];
        if (M < 0)
          continue;
        int Idx = (SubLane * NumSubLaneElts) + Elt;
        RepeatedMask[Idx] = M + (Lane * NumLaneElts);
      }
    }

    // Shuffle each source sub-lane to its destination.
    SmallVector SubLaneMask((unsigned)NumElts, -1);
    for (int i = 0; i != NumElts; i += NumSubLaneElts) {
      int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
      if (SrcSubLane < 0)
        continue;
      for (int j = 0; j != NumSubLaneElts; ++j)
        SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
    }

    // Avoid returning the same shuffle operation.
    // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
    if (RepeatedMask == Mask || SubLaneMask == Mask)
      return SDValue();

    SDValue RepeatedShuffle =
        DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

    return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
                                SubLaneMask);
  };

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
  // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
  // Otherwise we can only permute whole 128-bit lanes.
  int MinSubLaneScale = 1, MaxSubLaneScale = 1;
  if (Subtarget.hasAVX2() && VT.is256BitVector()) {
    bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
    MinSubLaneScale = 2;
    MaxSubLaneScale =
        (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
  }
  if (Subtarget.hasBWI() && VT == MVT::v64i8)
    MinSubLaneScale = MaxSubLaneScale = 4;

  // Try each candidate scale (doubling each time) and take the first success.
  for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
    if (SDValue Shuffle = ShuffleSubLanes(Scale))
      return Shuffle;

  return SDValue();
}

/// Check whether Mask can be implemented as a SHUFPD of V1 and V2.
///
/// Even result elements must select from one operand's element pair and odd
/// result elements from the other operand's pair; if only the operand-swapped
/// form matches, V1 and V2 are swapped in place. On success ShuffleImm holds
/// one selector bit per element (Mask[i] % 2 in bit i), and ForceV1Zero /
/// ForceV2Zero report whether all even / all odd result elements are zeroable.
/// Returns false when neither form matches.
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                   bool &ForceV1Zero, bool &ForceV2Zero,
                                   unsigned &ShuffleImm, ArrayRef Mask,
                                   const APInt &Zeroable) {
  int NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() == 64 &&
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");
  assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
         "Illegal shuffle mask");

  // ZeroLane[0] / ZeroLane[1]: every even / odd result element is zeroable.
  bool ZeroLane[2] = { true, true };
  for (int i = 0; i < NumElts; ++i)
    ZeroLane[i & 1] &= Zeroable[i];

  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
  // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
  ShuffleImm = 0;
  bool ShufpdMask = true;
  bool CommutableMask = true;
  for (int i = 0; i < NumElts; ++i) {
    // Undef elements and elements in a fully-zeroable position class impose
    // no constraint and leave their immediate bit clear.
    if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
      continue;
    if (Mask[i] < 0)
      return false;
    // Val / CommutVal: first index of the two-element pair this result element
    // may read from, without / with the operands swapped.
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
      ShufpdMask = false;
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      CommutableMask = false;
    ShuffleImm |= (Mask[i] % 2) << i;
  }

  if (!ShufpdMask && !CommutableMask)
    return false;

  if (!ShufpdMask && CommutableMask)
    std::swap(V1, V2);

  ForceV1Zero = ZeroLane[0];
  ForceV2Zero = ZeroLane[1];
  return true;
}

/// Lower a v2f64/v4f64/v8f64 shuffle to a single X86ISD::SHUFP node when
/// matchShuffleWithSHUFPD accepts the mask; otherwise return an empty SDValue.
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef Mask,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");

  unsigned Immediate = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
                              Mask, Zeroable))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                     DAG.getTargetConstant(Immediate, DL, MVT::i8));
}

// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vmovqb instructions shuffled together.
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef Mask,
                                             const APInt &Zeroable,
                                             SelectionDAG &DAG) {
  assert(VT == MVT::v32i8 && "Unexpected type!");

  // The first 8 indices should be every 8th element.
  if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
    return SDValue();

  // Remaining elements need to be zeroable.
  if (Zeroable.countl_one() < (Mask.size() - 8))
    return SDValue();

  // Truncate each input, viewed as v4i64, down to bytes.
  V1 = DAG.getBitcast(MVT::v4i64, V1);
  V2 = DAG.getBitcast(MVT::v4i64, V2);

  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);

  // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
  // the upper bits of the result using an unpckldq.
  SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
                                        { 0, 1, 2, 3, 16, 17, 18, 19,
                                          4, 5, 6, 7, 20, 21, 22, 23 });
  // Insert the unpckldq into a zero vector to widen to v32i8.
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
                     DAG.getConstant(0, DL, MVT::v32i8), Unpack,
                     DAG.getIntPtrConstant(0, DL));
}

// a = shuffle v1, v2, mask1    ; interleaving lower lanes of v1 and v2
// b = shuffle v1, v2, mask2    ; interleaving higher lanes of v1 and v2
//     =>
// ul = unpckl v1, v2
// uh = unpckh v1, v2
// a = vperm ul, uh
// b = vperm ul, uh
//
// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
// and permute. We cannot directly match v3 because it is split into two
// 256-bit vectors in earlier isel stages.
// Therefore, this function matches a
// pair of 256-bit shuffles and makes sure the masks are consecutive.
//
// Once unpck and permute nodes are created, the permute corresponding to this
// shuffle is returned, while the other permute replaces the other half of the
// shuffle in the selection dag.
// NOTE(review): template argument lists (e.g. on ArrayRef / SmallVector /
// cast) look like they were stripped from this copy of the file - confirm
// against upstream.
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
                                                 SDValue V1, SDValue V2,
                                                 ArrayRef Mask,
                                                 SelectionDAG &DAG) {
  if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
      VT != MVT::v32i8)
    return SDValue();
  // Matches the mask { Begin0, Begin1, Begin0 + 1, Begin1 + 1, ... }.
  auto IsInterleavingPattern = [&](ArrayRef Mask, unsigned Begin0,
                                   unsigned Begin1) {
    size_t Size = Mask.size();
    assert(Size % 2 == 0 && "Expected even mask size");
    for (unsigned I = 0; I < Size; I += 2) {
      if (Mask[I] != (int)(Begin0 + I / 2) ||
          Mask[I + 1] != (int)(Begin1 + I / 2))
        return false;
    }
    return true;
  };
  // Check which half is this shuffle node
  int NumElts = VT.getVectorNumElements();
  size_t FirstQtr = NumElts / 2;
  size_t ThirdQtr = NumElts + NumElts / 2;
  bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
  bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
  if (!IsFirstHalf && !IsSecondHalf)
    return SDValue();

  // Find the intersection between shuffle users of V1 and V2.
  SmallVector Shuffles;
  for (SDNode *User : V1->uses())
    if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
        User->getOperand(1) == V2)
      Shuffles.push_back(User);
  // Limit user size to two for now.
  if (Shuffles.size() != 2)
    return SDValue();
  // Find out which half of the 512-bit shuffles is each smaller shuffle
  auto *SVN1 = cast(Shuffles[0]);
  auto *SVN2 = cast(Shuffles[1]);

  SDNode *FirstHalf;
  SDNode *SecondHalf;
  if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
      IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
    FirstHalf = Shuffles[0];
    SecondHalf = Shuffles[1];
  } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
             IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
    FirstHalf = Shuffles[1];
    SecondHalf = Shuffles[0];
  } else {
    return SDValue();
  }
  // Lower into unpck and perm. Return the perm of this shuffle and replace
  // the other.
  SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
  SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
  SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
                              DAG.getTargetConstant(0x20, DL, MVT::i8));
  SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
                              DAG.getTargetConstant(0x31, DL, MVT::i8));
  if (IsFirstHalf) {
    DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
    return Perm1;
  }
  DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
  return Perm2;
}

/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
///
/// Strategies are tried in order; the first one that produces a node wins.
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      // One immediate bit per result element, set when that element selects
      // the odd source index of its half (1 for elements 0-1, 3 for 2-3).
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
                         DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
    }

    // With AVX2 we have direct support for this permutation.
    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

    // Try to create an in-lane repeating shuffle mask and then shuffle the
    // results into the target lanes.
    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

    // Try to permute the lanes and then use a per-lane permute.
    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
                                                        Mask, DAG, Subtarget))
      return V;

    // Otherwise, fall back.
    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Op;

  bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
  bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);

  // If we have lane crossing shuffles AND they don't all come from the lower
  // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
  // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
  // canonicalize to a blend of splat which isn't necessary for this combine.
  if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
      !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
      (V1.getOpcode() != ISD::BUILD_VECTOR) &&
      (V2.getOpcode() != ISD::BUILD_VECTOR))
    return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);

  // If we have one input in place, then we can permute the other input and
  // blend the result.
  if (V1IsInPlace || V2IsInPlace)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
    if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
      return V;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;

  // If we have AVX2 then we always want to lower with a blend because at v4 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
                                                Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
                                    Subtarget, DAG);
}

/// Handle lowering of 4-lane 64-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
///
/// Strategies are tried in order; the first one that produces a node wins.
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

  if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
                                     Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // Try to use shift instructions if fast.
  if (Subtarget.preferLowerShuffleAsShift())
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on both lanes.
    SmallVector RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
      // Re-express the 64-bit repeated mask over 32-bit elements so it can be
      // issued as a PSHUFD on the v8i32 bitcast of V1.
      SmallVector PSHUFDMask;
      narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v4i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
                      DAG.getBitcast(MVT::v8i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // If we have VLX support, we can use VALIGN or VEXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;
  }

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
    return V;

  bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
  bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);

  // If we have one input in place, then we can permute the other input and
  // blend the result.
  if (V1IsInPlace || V2IsInPlace)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                                Subtarget, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to lower to PERMQ(BLENDD(V1,V2)).
  if (SDValue V =
          lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
  // instruction so skip this pattern.
  if (!V1IsInPlace && !V2IsInPlace)
    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
                                              Subtarget, DAG);
}

/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
///
/// Strategies are tried in order; the first one that produces a node wins.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // Without AVX2, try a simple split when the in-lane form of the mask is not
  // repeated across the two 128-bit lanes.
  if (!Subtarget.hasAVX2()) {
    SmallVector InLaneMask;
    computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);

    if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
      if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
                                           /*SimpleOnly*/ true))
        return R;
  }
  // The extend lowering is requested with the integer type v8i32 and its
  // result is bitcast back to v8f32.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return DAG.getBitcast(MVT::v8f32, ZExt);

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes use the variable mask to VPERMILPS.
  if (V2.isUndef()) {
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
    }
    if (Subtarget.hasAVX2()) {
      // Note the operand order: VPERMV takes the index vector first.
      SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
    }
    // Otherwise, fall back.
    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;

  // Try to match an interleave of two v8f32s and lower them as unpck and
  // permutes using ymms. This needs to go before we try to split the vectors.
  //
  // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
  // this path inadvertently.
  if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
                                                      Mask, DAG))
      return V;

  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
  // since after split we get a more efficient code using vpunpcklwd and
  // vpunpckhwd instrs than vblend.
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
    return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
                                      DAG);

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
                                                Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
                                    Subtarget, DAG);
}

/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
///
/// Strategies are tried in order; the first one that produces a node wins.
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Number of mask elements that read from V2 (indices >= 8).
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Try to match an interleave of two v8i32s and lower them as unpck and
  // permutes using ymms. This needs to go before we try to split the vectors.
  if (!Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
                                                      Mask, DAG))
      return V;

  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
  // since after split we get a more efficient code than vblend by using
  // vpunpcklwd and vpunpckhwd instrs.
  if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
      !Subtarget.hasAVX512())
    return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
                                      DAG);

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  // Try to use shift instructions if fast.
  if (Subtarget.preferLowerShuffleAsShift()) {
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;
    if (NumV2Elements == 0)
      if (SDValue Rotate =
              lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
        return Rotate;
  }

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // The bit-rotate attempt was already made above when shifts are preferred.
  if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
      return Rotate;

  // If we have VLX support, we can use VALIGN or EXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
                                         DAG, Subtarget))
      return V;
  }

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;

    // If the shuffle patterns aren't repeated but it's a single input, directly
    // generate a cross-lane VPERMD instruction.
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                            CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v8i32, ShufPS);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
                                              Subtarget, DAG);
}

/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
///
/// Lowering strategies are tried in the order written below; the first one
/// that produces a node is returned.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // NOTE(review): `ArrayRef` / `SmallVector` appear without template arguments
  // throughout this text - presumably stripped during extraction; confirm
  // against the upstream file.
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V =
          lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, Subtarget))
    return V;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
                              Subtarget, DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Strategies that only apply when there is a single input.
  if (V2.isUndef()) {
    // Try to use bit rotation instructions.
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
      return Rotate;

    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V =
            lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
      return V;

    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
      if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
              DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
        return V;

      // Lane-crossing single-input masks always terminate here.
      return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
                                                 DAG, Subtarget);
    }

    SmallVector RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
  if (Subtarget.hasBWI())
    return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Try to permute the lanes and then use a per-lane permute.
  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
    return V;

  // Try to match an interleave of two v16i16s and lower them as unpck and
  // permutes using ymms.
  if (!Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
                                                      Mask, DAG))
      return V;

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget,
                                    DAG);
}

/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
///
/// Lowering strategies are tried in the order written below; the first one
/// that produces a node is returned.
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
                                                  Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V =
          lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, Subtarget))
    return V;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to use bit rotation instructions.
  if (V2.isUndef())
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
      return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
      return V;

    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
      return V;

    // Lane-crossing single-input masks always terminate here.
    return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
                                               DAG, Subtarget);
  }

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Try to permute the lanes and then use a per-lane permute.
  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
    return V;

  // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
  // by zeroable elements in the remaining 24 elements. Turn this into two
  // vmovqb instructions shuffled together.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
                                                  Mask, Zeroable, DAG))
      return V;

  // Try to match an interleave of two v32i8s and lower them as unpck and
  // permutes using ymms.
  if (!Subtarget.hasAVX512())
    if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
                                                      Mask, DAG))
      return V;

  // Otherwise fall back on generic lowering.
  return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget,
                                    DAG);
}

/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
///
/// Integer types without AVX2 never reach the per-type routines: elements
/// narrower than 32 bits go through bit-mask / bit-blend / split lowering, and
/// wider elements are re-expressed as a shuffle of the floating point vector
/// type with the same element width. v16f16 / v16bf16 are re-expressed as a
/// v16i16 shuffle.
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT,
                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = VT.getVectorNumElements();
  // Mask entries >= NumElts are the ones counted as V2 elements.
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that domain.
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32) {
      // No floating point type available, if we can't use the bit operations
      // for masking/blending then decompose into 128-bit vectors.
      if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                            Subtarget, DAG))
        return V;
      if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return V;
      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
    }

    // Same element count, floating point element type of the same bit width.
    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getBitcast(FpVT, V1);
    V2 = DAG.getBitcast(FpVT, V2);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  // Half-precision vectors are shuffled as v16i16 and cast back.
  if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
    V1 = DAG.getBitcast(MVT::v16i16, V1);
    V2 = DAG.getBitcast(MVT::v16i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}

/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
///
/// Asserts that \p VT is a 512-bit vector with 64-bit elements. Returns an
/// empty SDValue when \p Mask cannot be widened to 128-bit granularity, or
/// when the two 128-bit pieces feeding one half of the result would have to
/// come from different inputs.
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // To handle 256 bit vector requires VLX and most probably
  // function lowerV2X128VectorShuffle() is better solution.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
  SmallVector Widened128Mask;
  if (!canWidenShuffleElements(Mask, Widened128Mask))
    return SDValue();
  assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");

  // Try to use an insert into a zero vector.
  if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
      (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
    // Keep 2 elements when elements 2-3 are zeroable as well, otherwise 4.
    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
  bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
  if (OnlyUsesV1 ||
      isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
    // The low four elements of V1 (or of V2) are inserted at element 4 of V1.
    SDValue SubVec =
        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
                    DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                       DAG.getIntPtrConstant(4, DL));
  }

  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
    if (Widened128Mask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
    if (Widened128Mask[i] < 4) {
      if (Widened128Mask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and it's the lowest 128-bits.
      if (V2Index >= 0 || Widened128Mask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    // V2Index counts 128-bit pieces; scale it to a 64-bit element index.
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
  // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
  // possible we at least ensure the lanes stay sequential to help later
  // combines.
  SmallVector Widened256Mask;
  if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
    Widened128Mask.clear();
    narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  int PermMask[4] = {-1, -1, -1, -1};
  // Ensure elements came from the same Op.
  for (int i = 0; i < 4; ++i) {
    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
    if (Widened128Mask[i] < 0)
      continue;

    SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
    // Result pieces 0-1 share Ops[0]; pieces 2-3 share Ops[1].
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    PermMask[i] = Widened128Mask[i] % 4;
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
}

/// Handle lowering of 8-lane 64-bit floating point shuffles.
///
/// Single-input masks are first matched against MOVDDUP, an immediate
/// VPERMILPI (when no 128-bit lane is crossed) and an immediate VPERMI (when
/// the mask repeats per 256 bits). Remaining masks fall through an ordered
/// list of strategies ending in lowerShuffleWithPERMV.
// NOTE(review): `ArrayRef` / `SmallVector` appear without template arguments in
// this text - presumably stripped during extraction; confirm against upstream.
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Unary shuffles get first pick of the cheap single-source forms.
  auto TryUnaryForms = [&]() -> SDValue {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation. Bit Elt of the immediate is set when result
      // element Elt reads the odd element of its own pair, i.e. (Elt | 1).
      unsigned Imm = 0;
      for (int Elt = 0; Elt != 8; ++Elt)
        if (Mask[Elt] == (Elt | 1))
          Imm |= 1u << Elt;
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getTargetConstant(Imm, DL, MVT::i8));
    }

    SmallVector Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));

    return SDValue();
  };

  if (V2.isUndef())
    if (SDValue Unary = TryUnaryForms())
      return Unary;

  // Whole 128-bit subvector shuffles.
  if (SDValue Lanes128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
                                            V2, Subtarget, DAG))
    return Lanes128;

  if (SDValue Unpacked =
          lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpacked;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue ShufPD = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
    return ShufPD;

  if (SDValue Expanded = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask,
                                              V1, V2, DAG, Subtarget))
    return Expanded;

  if (SDValue Blended = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
    return Blended;

  // Nothing cheaper matched.
  return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
}

/// Handle lowering of 16-lane 32-bit floating point shuffles.
///
/// Masks that repeat in every 128-bit lane are fully handled by the first
/// branch (it always returns); other masks fall through an ordered list of
/// strategies ending in lowerShuffleWithPERMV.
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // NOTE(review): `ArrayRef` / `SmallVector` appear without template arguments
  // throughout this text - presumably stripped during extraction; confirm
  // against the upstream file.
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
    if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

    // Single input: one immediate-controlled in-lane permute.
    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
      return V;

    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

    // Otherwise, fall back to a SHUFPS sequence.
    return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
  }

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  // The extend is matched as v16i32 and the result cast back to v16f32.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return DAG.getBitcast(MVT::v16f32, ZExt);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
  if (V2.isUndef() &&
      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
    SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
  }

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
                                       V1, V2, DAG, Subtarget))
    return V;

  return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
}

/// Handle lowering of 8-lane 64-bit integer shuffles.
///
/// Lowering strategies are tried in the order written below; the first one
/// that produces a node is returned, with lowerShuffleWithPERMV as the final
/// fallback.
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Try to use shift instructions if fast.
  if (Subtarget.preferLowerShuffleAsShift())
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all four
    // 128-bit lanes.
    SmallVector Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      // Re-express the 64-bit element mask as a 32-bit element mask so the
      // shuffle can be emitted as a v16i32 PSHUFD.
      SmallVector PSHUFDMask;
      narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    SmallVector Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
                                           V2, Subtarget, DAG))
    return Shuf128;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR.
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;

  if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
}

/// Handle lowering of 16-lane 32-bit integer shuffles.
///
/// Lowering strategies are tried in the order written below; the first one
/// that produces a node is returned, with lowerShuffleWithPERMV as the final
/// fallback.
///
/// \param Mask 16-entry shuffle mask; entries >= 16 are counted as V2 elements.
/// \param Zeroable forwarded unchanged to the helpers that accept it.
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Try to use shift instructions if fast.
  if (Subtarget.preferLowerShuffleAsShift()) {
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ true))
      return Shift;
    // The bit-rotate helper is only handed V1, so it is only attempted when no
    // mask element refers to V2.
    if (NumV2Elements == 0)
      if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
                                                   Subtarget, DAG))
        return Rotate;
  }

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
                              Subtarget, DAG, /*BitwiseOnly*/ false))
    return Shift;

  // When shifts are not preferred the bit-rotate attempt happens here instead
  // of above. It must use the same "no V2 elements" guard as the earlier
  // attempt: the helper never sees V2, so trying it on a two-input mask (and
  // skipping it on a single-input one) would be backwards.
  if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
      return Rotate;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
    return Rotate;

  // Try to use byte rotation instructions.
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than using a permv shuffle.
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                            CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v16i32, ShufPS);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
                                       DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
}

/// Handle lowering of 32-lane 16-bit integer shuffles.
///
/// Asserts that AVX-512 BWI is available. Lowering strategies are tried in the
/// order written below, with lowerShuffleWithPERMV as the final fallback.
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask,
                                  const APInt &Zeroable, SDValue V1, SDValue V2,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  // NOTE(review): `ArrayRef` / `SmallVector` appear without template arguments
  // throughout this text - presumably stripped during extraction; confirm
  // against the upstream file.
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V =
          lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
                              Subtarget, DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Strategies that only apply when there is a single input.
  if (V2.isUndef()) {
    // Try to use bit rotation instructions.
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
      return Rotate;

    SmallVector RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v32 case.
      return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
                                                 RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
}

/// Handle lowering of 64-lane 8-bit integer shuffles.
///
/// Asserts that AVX-512 BWI is available. Lowering strategies are tried in the
/// order written below. Masks that do not cross 128-bit lanes always finish in
/// the PALIGNR+permute / blend-of-PSHUFBs branch; lane-crossing masks end with
/// lowerShuffleWithPERMV when VBMI is available and splitAndLowerShuffle
/// otherwise.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V =
          lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Try to use bit rotation instructions.
  if (V2.isUndef())
    if (SDValue Rotate =
            lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
      return Rotate;

  // Lower as AND if possible.
  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
                                              Zeroable, Subtarget, DAG))
    return PSHUFB;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
          DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
    return Result;

  if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
                                          Zeroable, Subtarget, DAG))
    return Blend;

  if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
    // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
    // PALIGNR will be cheaper than the second PSHUFB+OR.
    if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
                                                       Mask, Subtarget, DAG))
      return V;

    // If we can't directly blend but can use PSHUFB, that will be better as it
    // can both shuffle and set up the inefficient blend.
    // The two flags are out-parameters of the helper and are not read here.
    bool V1InUse, V2InUse;
    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
                                        DAG, V1InUse, V2InUse);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (!V2.isUndef())
    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
            DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
      return Result;

  // VBMI can use VPERMV/VPERMV3 byte shuffles.
  if (Subtarget.hasVBMI())
    return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);

  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}

/// High-level routine to lower various 512-bit x86 vector shuffles.
/// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) { // Try using bit ops for masking and blending before falling back to // splitting. if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); } if (VT == MVT::v32f16 || VT == MVT::v32bf16) { if (!Subtarget.hasBWI()) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); V1 = DAG.getBitcast(MVT::v32i16, V1); V2 = DAG.getBitcast(MVT::v32i16, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask)); } // Dispatch to each element type for lowering. 
If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } } static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle should be unary. if (!V2.isUndef()) return SDValue(); int ShiftAmt = -1; int NumElts = Mask.size(); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) && "Unexpected mask index."); if (M < 0) continue; // The first non-undef element determines our shift amount. if (ShiftAmt < 0) { ShiftAmt = M - i; // Need to be shifting right. if (ShiftAmt <= 0) return SDValue(); } // All non-undef elements must shift by the same amount. if (ShiftAmt != M - i) return SDValue(); } assert(ShiftAmt >= 0 && "All undef?"); // Great we found a shift right. SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL); Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } // Determine if this shuffle can be implemented with a KSHIFT instruction. 
// Returns the shift amount if possible or -1 if not. This is a simplified // version of matchShuffleAsShift. static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef Mask, int MaskOffset, const APInt &Zeroable) { int Size = Mask.size(); auto CheckZeros = [&](int Shift, bool Left) { for (int j = 0; j < Shift; ++j) if (!Zeroable[j + (Left ? 0 : (Size - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, bool Left) { unsigned Pos = Left ? Shift : 0; unsigned Low = Left ? 0 : Shift; unsigned Len = Size - Shift; return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset); }; for (int Shift = 1; Shift != Size; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) { Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR; return Shift; } return -1; } // Lower vXi1 vector shuffles. // There is no a dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to SIMD // vector, shuffle and then truncate it back. static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); // Try to recognize shuffles that are just padding a subvector with zeros. int SubvecElts = 0; int Src = -1; for (int i = 0; i != NumElts; ++i) { if (Mask[i] >= 0) { // Grab the source from the first valid mask. All subsequent elements need // to use this same source. if (Src < 0) Src = Mask[i] / NumElts; if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i) break; } ++SubvecElts; } assert(SubvecElts != NumElts && "Identity shuffle?"); // Clip to a power 2. 
SubvecElts = llvm::bit_floor(SubvecElts); // Make sure the number of zeroable bits in the top at least covers the bits // not covered by the subvector. if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) { assert(Src >= 0 && "Expected a source!"); MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getConstant(0, DL, VT), Extract, DAG.getIntPtrConstant(0, DL)); } // Try a simple shift right with undef elements. Later we'll try with zeros. if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG)) return Shift; // Try to match KSHIFTs. unsigned Offset = 0; for (SDValue V : { V1, V2 }) { unsigned Opcode; int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); if (ShiftAmt >= 0) { SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL); MVT WideVT = Res.getSimpleValueType(); // Widened right shifts need two shifts to ensure we shift in zeroes. if (Opcode == X86ISD::KSHIFTR && WideVT != VT) { int WideElts = WideVT.getVectorNumElements(); // Shift left to put the original vector in the MSBs of the new size. Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); // Increase the shift amount to account for the left shift. ShiftAmt += WideElts - NumElts; } Res = DAG.getNode(Opcode, DL, WideVT, Res, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } Offset += NumElts; // Increment for next iteration. } // If we're performing an unary shuffle on a SETCC result, try to shuffle the // ops instead. // TODO: What other unary shuffles would benefit from this? 
if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) { SDValue Op0 = V1.getOperand(0); SDValue Op1 = V1.getOperand(1); ISD::CondCode CC = cast(V1.getOperand(2))->get(); EVT OpVT = Op0.getValueType(); if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask)) return DAG.getSetCC( DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask), DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC); } MVT ExtVT; switch (VT.SimpleTy) { default: llvm_unreachable("Expected a vector of i1 elements"); case MVT::v2i1: ExtVT = MVT::v2i64; break; case MVT::v4i1: ExtVT = MVT::v4i32; break; case MVT::v8i1: // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit // shuffle. ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64; break; case MVT::v16i1: // Take 512-bit type, unless we are avoiding 512-bit types and have the // 256-bit operation available. ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16; break; case MVT::v32i1: // Take 512-bit type, unless we are avoiding 512-bit types and have the // 256-bit operation available. assert(Subtarget.hasBWI() && "Expected AVX512BW support"); ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8; break; case MVT::v64i1: // Fall back to scalarization. FIXME: We can do better if the shuffle // can be partitioned cleanly. if (!Subtarget.useBWIRegs()) return SDValue(); ExtVT = MVT::v64i8; break; } V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); // i1 was sign extended we can use X86ISD::CVT2MASK. 
int NumElems = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && (NumElems >= 32)) || (Subtarget.hasDQI() && (NumElems < 32))) return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle, ISD::SETGT); return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } /// Helper function that returns true if the shuffle mask should be /// commuted to improve canonicalization. static bool canonicalizeShuffleMaskWithCommute(ArrayRef Mask) { int NumElements = Mask.size(); int NumV1Elements = 0, NumV2Elements = 0; for (int M : Mask) if (M < 0) continue; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return true; assert(NumV1Elements > 0 && "No V1 indices"); if (NumV2Elements == 0) return false; // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. 
if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : Mask.slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) return true; if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) SumV2Indices += i; else if (Mask[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) return true; if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) NumV2OddIndices += i % 2; else if (Mask[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return true; } } } return false; } static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return false; if (!V.getValueType().isSimple()) return false; MVT VT = V.getSimpleValueType().getScalarType(); if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI()) return false; // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd // are preferable to blendw/blendvb/masked-mov. if ((VT == MVT::i16 || VT == MVT::i8) && V.getSimpleValueType().getSizeInBits() < 512) return false; auto HasMaskOperation = [&](SDValue V) { // TODO: Currently we only check limited opcode. We probably extend // it to all binary operation by checking TLI.isBinOp(). switch (V->getOpcode()) { default: return false; case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::XOR: case ISD::OR: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: case ISD::ABS: case ISD::SHL: case ISD::SRL: case ISD::SRA: case ISD::MUL: break; } if (!V->hasOneUse()) return false; return true; }; if (HasMaskOperation(V)) return true; return false; } // Forward declaration. 
static SDValue canonicalizeShuffleMaskWithHorizOp(
    MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
    unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
    const X86Subtarget &Subtarget);

/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> OrigMask = SVOp->getMask();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
         "Can't lower MMX shuffles");

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node to second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef &&
      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
    SmallVector<int> NewMask(OrigMask);
    for (int &M : NewMask)
      if (M >= NumElements)
        M = -1;
    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
  (void)MaskUpperLimit; // Only used by the assert below.
  assert(llvm::all_of(OrigMask,
                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
         "Out of bounds shuffle index");

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. This mostly happens while decomposing complex shuffles into
  // simple ones. Directly lower these as a buildvector of zeros.
  APInt KnownUndef, KnownZero;
  computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);

  APInt Zeroable = KnownUndef | KnownZero;
  if (Zeroable.isAllOnes())
    return getZeroVector(VT, Subtarget, DAG, DL);

  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

  // Try to collapse shuffles into using a vector type with fewer elements but
  // wider element types. We cap this to not form integers or floating point
  // elements wider than 64 bits. It does not seem beneficial to form i128
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
  SmallVector<int> WidenedMask;
  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
      !canCombineAsMaskOperation(V1, Subtarget) &&
      !canCombineAsMaskOperation(V2, Subtarget) &&
      canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
    // Shuffle mask widening should not interfere with a broadcast opportunity
    // by obfuscating the operands with bitcasts.
    // TODO: Avoid lowering directly from this top-level function: make this
    // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
                                                    Subtarget, DAG))
      return Broadcast;

    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
    int NewNumElts = NumElements / 2;
    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      if (V2IsZero) {
        // Modify the new Mask to take all zeros from the all-zero vector.
        // Choose indices that are blend-friendly.
        bool UsedZeroVector = false;
        assert(is_contained(WidenedMask, SM_SentinelZero) &&
               "V2's non-undef elements are used?!");
        for (int i = 0; i != NewNumElts; ++i)
          if (WidenedMask[i] == SM_SentinelZero) {
            WidenedMask[i] = i + NewNumElts;
            UsedZeroVector = true;
          }
        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
        // some elements to be undef.
        if (UsedZeroVector)
          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
      }
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }

  SmallVector<SDValue> Ops = {V1, V2};
  SmallVector<int> Mask(OrigMask);

  // Canonicalize the shuffle with any horizontal ops inputs.
  // NOTE: This may update Ops and Mask.
  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
          Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
    return DAG.getBitcast(VT, HOp);

  V1 = DAG.getBitcast(VT, Ops[0]);
  V2 = DAG.getBitcast(VT, Ops[1]);
  assert(NumElements == (int)Mask.size() &&
         "canonicalizeShuffleMaskWithHorizOp "
         "shouldn't alter the shuffle mask size");

  // Commute the shuffle if it will improve canonicalization.
  if (canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(V1, V2);
  }

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (VT.is256BitVector())
    return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (VT.is512BitVector())
    return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  if (Is1BitVector)
    return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}

/// Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); // Only non-legal VSELECTs reach this lowering, convert those into generic // shuffles and re-use the shuffle lowering path for blends. if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask); } return SDValue(); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (isSoftF16(VT, Subtarget)) { MVT NVT = VT.changeVectorElementTypeToInteger(); return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond, DAG.getBitcast(NVT, LHS), DAG.getBitcast(NVT, RHS))); } // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // If this VSELECT has a vector if i1 as a mask, it will be directly matched // with patterns on the mask registers on AVX-512. MVT CondVT = Cond.getSimpleValueType(); unsigned CondEltSize = Cond.getScalarValueSizeInBits(); if (CondEltSize == 1) return Op; // Variable blends are only legal from SSE4.1 onward. 
if (!Subtarget.hasSSE41()) return SDValue(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); // Expand v32i16/v64i8 without BWI. if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) return SDValue(); // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions. if (VT.getSizeInBits() == 512) { // Build a mask by testing the condition against zero. MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, DAG.getConstant(0, dl, CondVT), ISD::SETNE); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, LHS, RHS); } // SEXT/TRUNC cases where the mask doesn't match the destination size. if (CondEltSize != EltSize) { // If we don't have a sign splat, rely on the expansion. if (CondEltSize != DAG.ComputeNumSignBits(Cond)) return SDValue(); MVT NewCondSVT = MVT::getIntegerVT(EltSize); MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts); Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT); return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS); } // v16i16/v32i8 selects without AVX2, if the condition and another operand // are free to split, then better to split before expanding the // select. Don't bother with XOP as it has the fast VPCMOV instruction. // TODO: This is very similar to narrowVectorSelect. // TODO: Add Load splitting to isFreeToSplitVector ? if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() && !Subtarget.hasXOP()) { bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG); bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) || (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse()); bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) || (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse()); if (FreeCond && (FreeLHS || FreeRHS)) return splitVectorOp(Op, DAG, dl); } // Only some types will be legal on some subtargets. 
If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. if (Subtarget.hasAVX2()) return Op; return SDValue(); case MVT::v8i16: case MVT::v16i16: { // Bitcast everything to the vXi8 type and use a vXi8 vselect. MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2); Cond = DAG.getBitcast(CastVT, Cond); LHS = DAG.getBitcast(CastVT, LHS); RHS = DAG.getBitcast(CastVT, RHS); SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS); return DAG.getBitcast(VT, Select); } } } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); assert(isa(Idx) && "Constant index expected"); SDLoc dl(Op); if (!Vec.getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless // we're going to zero extend the register or fold the store. if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) && !X86::mayFoldIntoStore(Op)) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); unsigned IdxVal = Idx->getAsZExtVal(); SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. 
And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx); return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) return Op; return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); auto* IdxC = dyn_cast(Idx); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512/128 if (!IdxC) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. if (NumElts == 1) { Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl); MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements()); return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec)); } MVT ExtEltVT = (NumElts <= 8) ? 
MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } unsigned IdxVal = IdxC->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; // Extend to natively supported kshift. Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl); // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); } // Helper to find all the extracted elements from a vector. static APInt getExtractedDemandedElts(SDNode *N) { MVT VT = N->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); APInt DemandedElts = APInt::getZero(NumElts); for (SDNode *User : N->uses()) { switch (User->getOpcode()) { case X86ISD::PEXTRB: case X86ISD::PEXTRW: case ISD::EXTRACT_VECTOR_ELT: if (!isa(User->getOperand(1))) { DemandedElts.setAllBits(); return DemandedElts; } DemandedElts.setBit(User->getConstantOperandVal(1)); break; case ISD::BITCAST: { if (!User->getValueType(0).isSimple() || !User->getValueType(0).isVector()) { DemandedElts.setAllBits(); return DemandedElts; } APInt DemandedSrcElts = getExtractedDemandedElts(User); DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts); break; } default: DemandedElts.setAllBits(); return DemandedElts; } } return DemandedElts; } SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); auto* IdxC = dyn_cast(Idx); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); if (!IdxC) { // Its more 
profitable to go through memory (1 cycles throughput) // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput) // IACA tool was used to get performance estimation // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) // // example : extractelement <16 x i8> %a, i32 %i // // Block Throughput: 3.00 Cycles // Throughput Bottleneck: Port5 // // | Num Of | Ports pressure in cycles | | // | Uops | 0 - DV | 5 | 6 | 7 | | // --------------------------------------------- // | 1 | | 1.0 | | | CP | vmovd xmm1, edi // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 // Total Num Of Uops: 4 // // // Block Throughput: 1.00 Cycles // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 // // | | Ports pressure in cycles | | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | // --------------------------------------------------------- // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] // Total Num Of Uops: 4 return SDValue(); } unsigned IdxVal = IdxC->getZExtValue(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { // Get the 128-bit vector. Vec = extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 // this can be done with a mask. 
IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(IdxVal, dl)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); MVT VT = Op.getSimpleValueType(); if (VT == MVT::i16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) && !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) { if (Subtarget.hasFP16()) return Op; return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); } SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (Subtarget.hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; // Only extract a single element from a v16i8 source - determine the common // DWORD/WORD that all extractions share, and extract the sub-byte. // TODO: Add QWORD MOVQ extraction? if (VT == MVT::i8) { APInt DemandedElts = getExtractedDemandedElts(Vec.getNode()); assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch"); // Extract either the lowest i32 or any i16, and extract the sub-byte. 
int DWordIdx = IdxVal / 4; if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), DAG.getIntPtrConstant(DWordIdx, dl)); int ShiftVal = (IdxVal % 4) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } int WordIdx = IdxVal / 2; if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, DAG.getBitcast(MVT::v8i16, Vec), DAG.getIntPtrConstant(WordIdx, dl)); int ShiftVal = (IdxVal % 2) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } } if (VT == MVT::f16 || VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; // Shuffle the element to the lowest element, then movss or movsh. SmallVector Mask(VecVT.getVectorNumElements(), -1); Mask[0] = static_cast(IdxVal); Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. if (IdxVal == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. 
///
/// \p Op is expected to carry (vector, element, index) as operands 0-2.
/// A constant index is lowered to SCALAR_TO_VECTOR (v1i1) + INSERT_SUBVECTOR.
/// A variable index is handled by sign-extending the mask and the element to a
/// wider integer element type, inserting there, and truncating back.
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non constant index. Extend source and destination,
    // insert element and then truncate the result.
    unsigned NumElts = VecVT.getVectorNumElements();
    // Up to 8 elements: pick the element width that fills 128 bits.
    // More than 8 elements: fall back to i8 elements.
    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
                                DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt),
                                Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }

  // Copy into a k-register, extract to v1i1 and insert_subvector.
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
}

/// Custom lowering for INSERT_VECTOR_ELT.
///
/// Operands are N0 (vector), N1 (scalar to insert) and N2 (index). The
/// strategies below are tried in order; an empty SDValue is returned when none
/// of them applies, and \p Op itself is returned for the constant-index
/// i32/i64 case on SSE4.1.
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = EltVT.getScalarSizeInBits();

  // Mask (vXi1) vectors have their own lowering.
  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG, Subtarget);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  // Null when the insertion index is not a compile-time constant.
  auto *N2C = dyn_cast<ConstantSDNode>(N2);

  // bf16: perform the insertion on the equivalent integer vector type and
  // bitcast the result back.
  if (EltVT == MVT::bf16) {
    MVT IVT = VT.changeVectorElementTypeToInteger();
    SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
                              DAG.getBitcast(IVT, N0),
                              DAG.getBitcast(MVT::i16, N1), N2);
    return DAG.getBitcast(VT, Res);
  }

  if (!N2C) {
    // Variable insertion indices, usually we're better off spilling to stack,
    // but AVX512 can use a variable compare+select by comparing against all
    // possible vector indices, and FP insertion has less gpr->simd traffic.
    if (!(Subtarget.hasBWI() ||
          (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
          (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
      return SDValue();

    // Index vector uses integer elements of the same width as the data.
    MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
    MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
    if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
      return SDValue();

    SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
    SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
    SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);

    // Build the constant vector {0, 1, 2, ..., NumElts-1}.
    SmallVector<SDValue, 16> RawIndices;
    for (unsigned I = 0; I != NumElts; ++I)
      RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
    SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);

    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
    return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
                           ISD::CondCode::SETEQ);
  }

  // Out-of-range constant index: no custom lowering.
  if (N2C->getAPIntValue().uge(NumElts))
    return SDValue();
  uint64_t IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  if (IsZeroElt || IsAllOnesElt) {
    // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
    // We don't deal with i8 0 since it appears to be handled elsewhere.
    if (IsAllOnesElt &&
        ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
         ((VT == MVT::v32i8 || VT == MVT::v16i16) &&
          !Subtarget.hasInt256()))) {
      SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
      SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
      // All-zero vector except for an all-ones element at IdxVal.
      SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
      CstVectorElts[IdxVal] = OnesCst;
      SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
      return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
    }
    // See if we can do this more efficiently with a blend shuffle with a
    // rematerializable vector.
    if (Subtarget.hasSSE41() &&
        (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
      // Identity mask, except lane IdxVal which selects from the second
      // (constant) shuffle operand.
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                    : getOnesVector(VT, DAG, dl);
      return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
    }
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
    }

    unsigned NumEltsIn128 = 128 / EltSizeInBits;
    assert(isPowerOf2_32(NumEltsIn128) &&
           "Vectors will always have power-of-two number of elements.");

    // If we are not inserting into the low 128-bit vector chunk,
    // then prefer the broadcast+blend sequence.
    // FIXME: relax the profitability check iff all N1 uses are insertions.
    if (IdxVal >= NumEltsIn128 &&
        ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
         (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
          X86::mayFoldLoad(N1, Subtarget)))) {
      SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getIntPtrConstant(IdxIn128, dl));

    // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // This will be just movw/movd/movq/movsh/movss/movsd.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
    if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
        EltVT == MVT::f16 || EltVT == MVT::i64) {
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
      return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
    }

    // We can't directly insert an i8 or i16 into a vector, so zero extend
    // it to i32 first.
    if (EltVT == MVT::i16 || EltVT == MVT::i8) {
      N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
      MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
      N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
      return DAG.getBitcast(VT, N1);
    }
  }

  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
  // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
    N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      //   zero here. The DAG Combiner may combine an extract_elt index into
      //   these bits. For example (insert (extract, 3), 2) could be matched by
      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      //   value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      //   combine either bitwise AND or insert of float 0.0 to set these bits.

      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
      if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand form.
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
      // Create this as a scalar to vector..
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
    }

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}

/// Custom lowering for SCALAR_TO_VECTOR.
///
/// A zero scalar becomes a zero vector. Results wider than 128 bits are built
/// as a 128-bit SCALAR_TO_VECTOR inserted at position 0 of an undef wide
/// vector. Remaining 128-bit integer types are either passed through
/// unchanged or built as a v4i32 SCALAR_TO_VECTOR of the any-extended scalar
/// and bitcast to the requested type.
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace a xor+movd with xorps and simplifies further
  // combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If the result is wider than 128 bits, first insert into a 128-bit
  // vector and then insert that into the wide vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
         "Expected an SSE type!");

  // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
  // tblgen.
  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  // Only vXi1 results are expected here; the work is done by insert1BitVector.
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}

// Lower a vXi1 EXTRACT_SUBVECTOR. Index 0 is returned unchanged; any other
// index widens the source mask, shifts the wanted bits down to the LSB with
// KSHIFTR, and extracts the subvector at index 0.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  uint64_t IdxVal = Op.getConstantOperandVal(1);

  if (IdxVal == 0) // the operation is legal
    return Op;

  // Extend to natively supported kshift.
  Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);

  // Shift to the LSB.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}

// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind( const GlobalValue *GV, const unsigned char OpFlags) const { // References to absolute symbols are never PC-relative. if (GV && GV->isAbsoluteSymbolRef()) return X86ISD::Wrapper; // The following OpFlags under RIP-rel PIC use RIP. if (Subtarget.isPICStyleRIPRel() && (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB || OpFlags == X86II::MO_DLLIMPORT)) return X86ISD::WrapperRIP; // GOTPCREL references must always use RIP. if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX) return X86ISD::WrapperRIP; return X86ISD::Wrapper; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. 
unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); Result = DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); return Result; } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget.classifyBlockAddressReference(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); Result = DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } return Result; } /// Creates target global address or external symbol nodes for calls or /// other uses. SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall) const { // Unpack the global address or external symbol. SDLoc dl(Op); const GlobalValue *GV = nullptr; int64_t Offset = 0; const char *ExternalSym = nullptr; if (const auto *G = dyn_cast(Op)) { GV = G->getGlobal(); Offset = G->getOffset(); } else { const auto *ES = cast(Op); ExternalSym = ES->getSymbol(); } // Calculate some flags for address lowering. 
const Module &Mod = *DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlags; if (ForCall) OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod); else OpFlags = Subtarget.classifyGlobalReference(GV, Mod); bool HasPICReg = isGlobalRelativeToPICBase(OpFlags); bool NeedsLoad = isGlobalStubReference(OpFlags); CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (GV) { // Create a target global address if this is a global. If possible, fold the // offset into the global address reference. Otherwise, ADD it on later. // Suppress the folding if Offset is negative: movl foo-1, %eax is not // allowed because if the address of foo is 0, the ELF R_X86_64_32 // relocation will compute to a negative value, which is invalid. int64_t GlobalOffset = 0; if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 && X86::isOffsetSuitableForCodeModel(Offset, M, true)) { std::swap(GlobalOffset, Offset); } Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags); } else { // If this is not a global address, this must be an external symbol. Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags); } // If this is a direct call, avoid the wrapper if we don't need to do any // loads or adds. This allows SDAG ISel to match direct calls. if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0) return Result; Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (HasPICReg) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (NeedsLoad) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. 
if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, DAG.getConstant(Offset, dl, PtrVT)); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA; bool UseTLSDESC = DAG.getTarget().useTLSDESC(); if (LocalDynamic && UseTLSDESC) { TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags); auto UI = TGA->use_begin(); // Reuse existing GetTLSADDR node if we can find it. if (UI != TGA->use_end()) return SDValue(*UI->use_begin()->use_begin(), 0); } else { TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); } X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC : LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InGlue) { SDValue Ops[] = { Chain, TGA, *InGlue }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI.setAdjustsStack(true); MFI.setHasCalls(true); SDValue Glue = Chain.getValue(1); SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue); if (!UseTLSDESC) return Ret; const X86Subtarget &Subtarget = DAG.getSubtarget(); unsigned Seg = Subtarget.is64Bit() ? 
X86AS::FS : X86AS::GS; Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg)); SDValue Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InGlue; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue); InGlue = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64 static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32 static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::EAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64) { SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo *MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (Is64Bit) { unsigned ReturnReg = Is64BitLP64 ? 
X86::RAX : X86::EAX; Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InGlue; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue); InGlue = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue( PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? 
X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool PositionIndependent = isPositionIndependent(); if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget.is64Bit()) { if (Subtarget.isTarget64BitLP64()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT); } return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(), Subtarget.isTarget64BitLP64()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), PositionIndependent); } llvm_unreachable("Unknown TLS model."); } if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. 
Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = 0; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); if (PIC32) { OpFlag = X86II::MO_TLVP_PIC_BASE; WrapperKind = X86ISD::Wrapper; } else { OpFlag = X86II::MO_TLVP; WrapperKind = X86ISD::WrapperRIP; } SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget.is64Bit() ? 
X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget.isOSWindows()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue( Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256) : PointerType::get(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget.is64Bit() ? DAG.getIntPtrConstant(0x58, dl) : (Subtarget.isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget.is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32); else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); const DataLayout &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); 
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); } bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const { if (Subtarget.is64Bit() && Subtarget.isTargetELF()) { const TargetMachine &TM = getTargetMachine(); TLSModel::Model Model = TM.getTLSModel(&GV); switch (Model) { case TLSModel::LocalExec: case TLSModel::InitialExec: // We can include the %fs segment register in addressing modes. return true; case TLSModel::LocalDynamic: case TLSModel::GeneralDynamic: // These models do not result in %fs relative addresses unless // TLS descriptior are used. // // Even in the case of TLS descriptors we currently have no way to model // the difference between %fs access and the computations needed for the // offset and returning `true` for TLS-desc currently duplicates both // which is detrimental :-/ return false; } } return false; } /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. /// TODO: Can this be moved to general expansion code? static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue Lo, Hi; DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG); return DAG.getMergeValues({Lo, Hi}, SDLoc(Op)); } // Try to use a packed vector operation to handle i64 on 32-bit targets when // AVX512DQ is enabled. static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); bool IsStrict = Op->isStrictFPOpcode(); unsigned OpNo = IsStrict ? 
1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.

  // Using 256-bit to ensure result is 128-bits for f32 case.
  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
  MVT VecVT = MVT::getVectorVT(VT, NumElts);

  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
  if (IsStrict) {
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
                                 {Op.getOperand(0), InVec});
    SDValue Chain = CvtVec.getValue(1);
    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                                DAG.getIntPtrConstant(0, dl));
    return DAG.getMergeValues({Value, Chain}, dl);
  }

  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                     DAG.getIntPtrConstant(0, dl));
}

// Try to use a packed vector operation to handle i64 on 32-bit targets.
//
// Only an i64 source converted to f16 on a subtarget that is not 64-bit is
// handled; every other combination returns an empty SDValue. The scalar is
// placed in a v2i64, converted to v2f16 with the original opcode, and element
// 0 of the result is extracted. For strict opcodes the incoming chain
// (operand 0) is passed to the vector conversion and the resulting chain is
// returned merged with the value.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict nodes carry the chain as operand 0, so the value is operand 1.
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.
  assert(Subtarget.hasFP16() && "Expected FP16");

  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
  if (IsStrict) {
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
                                 {Op.getOperand(0), InVec});
    SDValue Chain = CvtVec.getValue(1);
    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                                DAG.getIntPtrConstant(0, dl));
    return DAG.getMergeValues({Value, Chain}, dl);
  }

  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                     DAG.getIntPtrConstant(0, dl));
}

/// Return true if the cast \p Opcode from vector type \p FromVT to vector type
/// \p ToVT can be done with a vector cast on \p Subtarget. Only v4i32 sources
/// are accepted: a signed conversion needs SSE2 (plus AVX for a v4f64 result),
/// an unsigned conversion needs AVX512. Any other opcode yields false.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
                          const X86Subtarget &Subtarget) {
  switch (Opcode) {
    case ISD::SINT_TO_FP:
      // TODO: Handle wider types with AVX/AVX512.
      if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
        return false;
      // CVTDQ2PS or (V)CVTDQ2PD
      return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);

    case ISD::UINT_TO_FP:
      // TODO: Handle wider types and i64 elements.
      if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
        return false;
      // VCVTUDQ2PS or VCVTUDQ2PD
      return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;

    default:
      return false;
  }
}

/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
///
/// Returns an empty SDValue when the cast operand is not an
/// EXTRACT_VECTOR_ELT with the expected index operand, or when useVectorCast()
/// rejects the 128-bit form of the cast.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // TODO: This could be enhanced to handle smaller integer types by peeking
  // through an extend.
  SDValue Extract = Cast.getOperand(0);
  MVT DestVT = Cast.getSimpleValueType();
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa(Extract.getOperand(1)))
    return SDValue();

  // See if we have a 128-bit vector cast op for this type of cast.
  SDValue VecOp = Extract.getOperand(0);
  MVT FromVT = VecOp.getSimpleValueType();
  // Number of source-sized elements that fit in 128 bits; the destination
  // vector type uses the same element count.
  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
    return SDValue();

  // If we are extracting from a non-zero element, first shuffle the source
  // vector to allow extracting from element zero.
  if (!isNullConstant(Extract.getOperand(1))) {
    // Only lane 0 of the mask is defined; every other lane is -1 (undef).
    SmallVector Mask(FromVT.getVectorNumElements(), -1);
    Mask[0] = Extract.getConstantOperandVal(1);
    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
  }
  // If the source vector is wider than 128-bits, extract the low part. Do not
  // create an unnecessarily wide vector cast op.
  if (FromVT != Vec128VT)
    VecOp = extract128BitVector(VecOp, 0, DAG, DL);

  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
                     DAG.getIntPtrConstant(0, DL));
}

/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
/// try to vectorize the cast ops. This will avoid an expensive round-trip
/// between XMM and GPR.
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
                                SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  // TODO: Allow FP_TO_UINT.
  SDValue CastToInt = CastToFP.getOperand(0);
  MVT VT = CastToFP.getSimpleValueType();
  if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
    return SDValue();

  MVT IntVT = CastToInt.getSimpleValueType();
  SDValue X = CastToInt.getOperand(0);
  MVT SrcVT = X.getSimpleValueType();
  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
    return SDValue();

  // See if we have 128-bit vector cast instructions for this type of cast.
  // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
      IntVT != MVT::i32)
    return SDValue();

  unsigned SrcSize = SrcVT.getSizeInBits();
  unsigned IntSize = IntVT.getSizeInBits();
  unsigned VTSize = VT.getSizeInBits();
  MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
  MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
  MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);

  // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
  unsigned ToIntOpcode =
      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
  unsigned ToFPOpcode =
      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
  //
  // We are not defining the high elements (for example, zero them) because
  // that could nullify any performance advantage that we hoped to gain from
  // this vector op hack. We do not expect any adverse effects (like denorm
  // penalties) with cast ops.
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
  SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
  SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
}

/// Custom lowering of [STRICT_]{S,U}INT_TO_FP from a 64-bit-element integer
/// vector.
///
/// With AVX512DQ (asserted to be without VLX) the v2i64/v4i64 source is
/// inserted into a v8i64, converted at the wider type with the original
/// opcode, and the low subvector of the result type is extracted.
///
/// Without DQI only the unsigned conversion to v4f32 is handled; all other
/// cases return an empty SDValue. Strict opcodes return the value merged with
/// the output chain.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool IsStrict = Op->isStrictFPOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);

  if (Subtarget.hasDQI()) {
    assert(!Subtarget.hasVLX() && "Unexpected features");

    assert((Src.getSimpleValueType() == MVT::v2i64 ||
            Src.getSimpleValueType() == MVT::v4i64) &&
           "Unsupported custom type");

    // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
           "Unexpected VT!");
    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;

    // Need to concat with zero vector for strict fp to avoid spurious
    // exceptions.
    SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
                           : DAG.getUNDEF(MVT::v8i64);
    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
                      DAG.getIntPtrConstant(0, DL));
    SDValue Res, Chain;
    if (IsStrict) {
      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
                        {Op->getOperand(0), Src});
      Chain = Res.getValue(1);
    } else {
      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
    }

    // Narrow the widened result back to the requested type.
    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                      DAG.getIntPtrConstant(0, DL));

    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, DL);
    return Res;
  }

  bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
                  Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
  if (VT != MVT::v4f32 || IsSigned)
    return SDValue();

  // Unsigned v4i64 -> v4f32 built from signed scalar conversions.
  // NOTE(review): all nodes below are typed v4i64, so Src is presumably v4i64
  // on this path -- confirm against callers.
  //
  // Elements with the top bit set are replaced by (Src >> 1) | (Src & 1), i.e.
  // halved while keeping the shifted-out low bit, converted as signed, and the
  // result is then doubled by adding it to itself. Other elements are
  // converted as signed directly.
  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
                             DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
                             DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
  SmallVector SignCvts(4);
  SmallVector Chains(4);
  // Convert each of the four elements with a scalar signed i64 -> f32 cast.
  for (int i = 0; i != 4; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
                              DAG.getIntPtrConstant(i, DL));
    if (IsStrict) {
      SignCvts[i] =
          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
                      {Op.getOperand(0), Elt});
      Chains[i] = SignCvts[i].getValue(1);
    } else {
      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
    }
  }
  SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);

  // Slow = SignCvt + SignCvt: the doubled value used for the halved elements.
  SDValue Slow, Chain;
  if (IsStrict) {
    // Join the four per-element chains before the strict add.
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
                       {Chain, SignCvt, SignCvt});
    Chain = Slow.getValue(1);
  } else {
    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
  }

  // Truncate the v4i64 compare result to v4i32 so it can select v4f32 lanes.
  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);

  if (IsStrict)
    return DAG.getMergeValues({Cvt, Chain}, DL);

  return Cvt;
}

/// Perform the integer-to-FP conversion \p Op at f32 (or, for vectors, at the
/// result vector type with its element type changed to f32) and round the
/// result to the original result type. For strict opcodes both the conversion
/// and the STRICT_FP_ROUND take the incoming chain (operand 0) as their chain
/// operand.
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
                                 SelectionDAG &DAG) {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
  MVT VT = Op.getSimpleValueType();
  MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;

  // Constant 0 passed as the trailing operand of the FP_ROUND nodes below.
  SDValue Rnd = DAG.getIntPtrConstant(0, dl);
  if (IsStrict)
    return DAG.getNode(
        ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
        {Chain,
         DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
         Rnd});
  return DAG.getNode(ISD::FP_ROUND, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
}

/// Return true if an int-to-FP conversion whose source is the integer vector
/// type \p VT is accepted as-is for \p Subtarget (the callers in this file
/// return the node unchanged when this holds). \p IsSigned selects between
/// the signed and unsigned conversion.
static bool isLegalConversion(MVT VT, bool IsSigned,
                              const X86Subtarget &Subtarget) {
  // Signed conversions from 32-bit elements: SSE2 for v4i32, AVX for v8i32.
  if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
    return true;
  if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
    return true;
  // With VLX both signednesses are accepted for v4i32/v8i32.
  if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
    return true;
  // 512-bit sources; the i64-element form additionally requires DQI.
  if (Subtarget.useAVX512Regs()) {
    if (VT == MVT::v16i32)
      return true;
    if (VT == MVT::v8i64 && Subtarget.hasDQI())
      return true;
  }
  // 128/256-bit i64-element sources need both DQI and VLX.
  if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
      (VT == MVT::v2i64 || VT == MVT::v4i64))
    return true;
  return false;
}

/// Custom lowering for SINT_TO_FP / STRICT_SINT_TO_FP.
///
/// Returns \p Op itself when the conversion is already acceptable, a
/// replacement value when one of the strategies below applies, or an empty
/// SDValue when no custom lowering is produced.
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict nodes carry the chain as operand 0, so the value is operand 1.
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (isSoftF16(VT, Subtarget))
    return promoteXINT_TO_FP(Op, dl, DAG);
  else if (isLegalConversion(SrcVT, true, Subtarget))
    return Op;

  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
    return LowerWin64_INT128_TO_FP(Op, DAG);

  if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
    return Extract;

  if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
    return R;

  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      // Note: Since v2f64 is a legal type, we don't need to zero extend the
      // source for strict FP.
      if (IsStrict)
        return DAG.getNode(
            X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
            {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                DAG.getUNDEF(SrcVT))});
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                     DAG.getUNDEF(SrcVT)));
    }
    if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
      return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);

    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  bool UseSSEReg = isScalarFPTypeInSSEReg(VT);

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && UseSSEReg)
    return Op;
  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
    return Op;

  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
    return V;
  if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
    return V;

  // SSE doesn't have an i16 conversion so we need to promote.
if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {Chain, Ext});

    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
  }

  // The remaining path goes through an x87 FILD, so give up for f128 or when
  // x87 is unavailable.
  if (VT == MVT::f128 || !Subtarget.hasX87())
    return SDValue();

  SDValue ValueToStore = Src;
  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

  // Spill the integer to a fresh stack slot sized and aligned to its store
  // size, then load it back with FILD.
  unsigned Size = SrcVT.getStoreSize();
  Align Alignment(Size);
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
  std::pair Tmp =
      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);

  // Strict nodes must also return the output chain.
  if (IsStrict)
    return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);

  return Tmp.first;
}

/// Build an X86ISD::FILD that loads an integer with memory type \p SrcVT from
/// \p Pointer (described by \p PtrInfo / \p Alignment) on chain \p Chain.
///
/// If \p DstVT is a scalar FP type held in an SSE register, the FILD produces
/// f80; that value is written with X86ISD::FST (memory type \p DstVT) to a new
/// stack slot and reloaded as \p DstVT. Otherwise the FILD produces \p DstVT
/// directly.
///
/// \returns the pair {converted value, output chain}.
std::pair X86TargetLowering::BuildFILD(
    EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
    MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
  // Build the FILD
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(DstVT);
  if (useSSE)
    Tys = DAG.getVTList(MVT::f80, MVT::Other);
  else
    Tys = DAG.getVTList(DstVT, MVT::Other);

  SDValue FILDOps[] = {Chain, Pointer};
  SDValue Result =
      DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
                              Alignment, MachineMemOperand::MOLoad);
  Chain = Result.getValue(1);

  if (useSSE) {
    // Round-trip the f80 result through a stack slot of DstVT's store size:
    // FST it out, then load it back as DstVT.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = DstVT.getStoreSize();
    int SSFI =
        MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue FSTOps[] = {Chain, Result, StackSlot};
    MachineMemOperand *StoreMMO =
        DAG.getMachineFunction().getMachineMemOperand(
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
            MachineMemOperand::MOStore, SSFISize, Align(SSFISize));

    Chain =
        DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
    Result = DAG.getLoad(
        DstVT, DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
    Chain = Result.getValue(1);
  }

  return { Result, Chain };
}

/// Horizontal vector math instructions may be slower than normal math with
/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
/// implementation, and likely shuffle complexity of the alternate sequence.
///
/// Returns true unless the op has a single source, the DAG is not optimizing
/// for size, and the subtarget does not report fast horizontal ops.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  bool IsOptimizingSize = DAG.shouldOptForSize();
  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}

/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
  // when converting 0 when rounding toward negative infinity. The caller will
  // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */

  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  // CV0 supplies the high 32-bit words that the unpack below interleaves with
  // the two 32-bit halves of the input (c0 in the sequence above).
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));

  // CV1 holds the two doubles subtracted afterwards (c1 in the sequence above).
  SmallVector CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));

  // Load the 64-bit value into an XMM register.
  SDValue XR1 =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
  SDValue CLod0 = DAG.getLoad(
      MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

  // The second constant-pool load is chained after the first one.
  SDValue CLod1 = DAG.getLoad(
      MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
  // TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;

  // Add the two lanes of Sub together: either with a horizontal add, or by
  // moving lane 1 into lane 0 with a shuffle and using a plain vector add.
  if (Subtarget.hasSSE3() &&
      shouldUseHorizontalOp(true, DAG, Subtarget)) {
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
  }
  // The scalar result is lane 0 of the sum.
  Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                       DAG.getIntPtrConstant(0, dl));
  return Result;
}

/// 32-bit unsigned integer to float expansion.
///
/// The i32 is placed in the low lane of a vector whose remaining lanes are
/// zeroed, OR'ed with the bit pattern of the f64 bias constant, and the bias
/// is then subtracted as an f64. The f64 difference is finally extended or
/// rounded to the result type. Handles both strict and non-strict opcodes;
/// the strict form returns the value merged with its chain unless the result
/// type is already f64, in which case the STRICT_FSUB node is returned.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
  // FP constant to bias correct the final result.
  // (0x4330000000000000 is the IEEE double bit pattern of 2^52.)
  SDValue Bias = DAG.getConstantFP(
      llvm::bit_cast(0x4330000000000000ULL), dl, MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  // Or the load with the bias.
  SDValue Or = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64, Load),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  // Reinterpret the OR result as v2f64 and take lane 0 as a scalar f64.
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

  if (Op.getNode()->isStrictFPOpcode()) {
    // Subtract the bias.
    // TODO: Are there any fast-math-flags to propagate here?
    SDValue Chain = Op.getOperand(0);
    SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
                              {Chain, Or, Bias});

    // No final conversion is needed when the result type is already f64.
    if (Op.getValueType() == Sub.getValueType())
      return Sub;

    // Handle final rounding.
    std::pair ResultPair = DAG.getStrictFPExtendOrRound(
        Sub, Sub.getValue(1), dl, Op.getSimpleValueType());

    return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
  }

  // Subtract the bias.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
}

/// Custom lowering of [STRICT_]UINT_TO_FP from v2i32 to v2f64; any other
/// result type returns an empty SDValue.
///
/// With AVX512 and VLX the source is widened to v4i32 and converted with
/// X86ISD::[STRICT_]CVTUI2P. With AVX512 but no VLX, the non-strict form is
/// left to generic legalization (empty SDValue) and the strict form is
/// widened to a v4i32 -> v4f64 conversion with a zero-padded input. Without
/// AVX512 the conversion is done with the 2^52 bias trick described below.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  bool IsStrict = Op->isStrictFPOpcode();

  SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  if (Subtarget.hasAVX512()) {
    if (!Subtarget.hasVLX()) {
      // Let generic type legalization widen this.
      if (!IsStrict)
        return SDValue();
      // Otherwise pad the integer input with 0s and widen the operation.
      N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                       DAG.getConstant(0, DL, MVT::v2i32));
      SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
                                {Op.getOperand(0), N0});
      SDValue Chain = Res.getValue(1);
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
                        DAG.getIntPtrConstant(0, DL));
      return DAG.getMergeValues({Res, Chain}, DL);
    }

    // Legalize to v4i32 type.
    N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                     DAG.getUNDEF(MVT::v2i32));
    if (IsStrict)
      return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
                         {Op.getOperand(0), N0});
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
  }

  // Zero extend to 2i64, OR with the floating point representation of 2^52.
  // This gives us the floating point equivalent of 2^52 + the i32 integer
  // since double has 52-bits of mantissa. Then subtract 2^52 in floating
  // point leaving just our i32 integers in double format.
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
  SDValue VBias = DAG.getConstantFP(
      llvm::bit_cast(0x4330000000000000ULL), DL, MVT::v2f64);
  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
                           DAG.getBitcast(MVT::v2i64, VBias));
  Or = DAG.getBitcast(MVT::v2f64, Or);

  if (IsStrict)
    return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
                       {Op.getOperand(0), Or, VBias});
  return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
}

/// Custom lowering of [STRICT_]UINT_TO_FP from v4i32 or v8i32.
///
/// - AVX512 (asserted to be without VLX): v8i32 -> v8f64 is returned
///   unchanged; otherwise the input is widened to 512 bits' worth of result,
///   converted, and the low subvector is extracted.
/// - AVX, v4i32 -> v4f64: zero-extend to v4i64, OR in a broadcast f64 bias
///   loaded from the constant pool, then subtract the bias.
/// - Otherwise, v4i32 -> v4f32 or v8i32 -> v8f32 using the lo/hi 16-bit split
///   algorithm documented in the body. Any other result type returns an empty
///   SDValue.
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue V = Op->getOperand(IsStrict ? 1 : 0);
  MVT VecIntVT = V.getSimpleValueType();
  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  if (Subtarget.hasAVX512()) {
    // With AVX512, but not VLX we need to widen to get a 512-bit result type.
    assert(!Subtarget.hasVLX() && "Unexpected features");
    MVT VT = Op->getSimpleValueType(0);

    // v8i32->v8f64 is legal with AVX512 so just return it.
    if (VT == MVT::v8f64)
      return Op;

    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
           "Unexpected VT!");
    MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
    MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
    // Need to concat with zero vector for strict fp to avoid spurious
    // exceptions.
    SDValue Tmp =
        IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
    V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
                    DAG.getIntPtrConstant(0, DL));
    SDValue Res, Chain;
    if (IsStrict) {
      Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
                        {Op->getOperand(0), V});
      Chain = Res.getValue(1);
    } else {
      Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
    }

    // Narrow the widened result back to the requested type.
    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                      DAG.getIntPtrConstant(0, DL));

    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, DL);
    return Res;
  }

  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
      Op->getSimpleValueType(0) == MVT::v4f64) {
    // Zero-extend each lane to i64, OR in the bit pattern of the f64 bias
    // (0x4330000000000000, i.e. 2^52), and subtract the bias as f64.
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
    Constant *Bias = ConstantFP::get(
        *DAG.getContext(),
        APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
    auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
    SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
    SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
    SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
    // Broadcast the single f64 constant-pool entry to all four lanes.
    SDValue VBias = DAG.getMemIntrinsicNode(
        X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
        MachineMemOperand::MOLoad);

    SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
                             DAG.getBitcast(MVT::v4i64, VBias));
    Or = DAG.getBitcast(MVT::v4f64, Or);

    if (IsStrict)
      return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
                         {Op.getOperand(0), Or, VBias});
    return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
  }

  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;

  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    // Blend on 16-bit lanes: mask 0xaa takes the odd i16 lanes from the
    // constant operand and the even i16 lanes from the value operand.
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to its
    // original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                                 (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFSub = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);

  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  // NOTE: By using fsub of a positive constant instead of fadd of a negative
  // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
  // enabled. See PR24512.
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  //     (float4) lo;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  //     return (float4) lo + fhi;
  if (IsStrict) {
    // The strict add is chained after the strict subtract.
    SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
                                {Op.getOperand(0), HighBitcast, VecCstFSub});
    return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
                       {FHigh.getValue(1), LowBitcast, FHigh});
  }

  SDValue FHigh =
      DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

/// Dispatch a vector [STRICT_]UINT_TO_FP to the helper matching its source
/// vector type. Source types other than v2i32, v4i32, v8i32, v2i64 and v4i64
/// are unreachable.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
  SDValue N0 = Op.getOperand(OpNo);
  MVT SrcVT = N0.getSimpleValueType();

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
  case MVT::v2i64:
  case MVT::v4i64:
    return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
  }
}

/// Custom lowering for UINT_TO_FP / STRICT_UINT_TO_FP.
///
/// Returns \p Op itself when the conversion is already acceptable, a
/// replacement value when one of the strategies below applies, or an empty
/// SDValue when no custom lowering is produced.
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  // Strict nodes carry the chain as operand 0, so the value is operand 1.
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op->getSimpleValueType(0);
  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  // Bail out when we don't have native conversion instructions.
  if (DstVT == MVT::f128)
    return SDValue();

  if (isSoftF16(DstVT, Subtarget))
    return promoteXINT_TO_FP(Op, dl, DAG);
  else if (isLegalConversion(SrcVT, false, Subtarget))
    return Op;

  if (DstVT.isVector())
    return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);

  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
    return LowerWin64_INT128_TO_FP(Op, DAG);

  if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
    return Extract;

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
    return Op;
  }

  // Promote i32 to i64 and use a signed conversion on 64-bit targets.
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
    // A zero-extended i32 is non-negative as an i64, so the signed conversion
    // gives the unsigned result.
    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
                         {Chain, Src});
    return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
  }

  if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
    return V;
  if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
    return V;

  // The transform for i64->f64 isn't correct for 0 when rounding to negative
  // infinity. It produces -0.0, so disable under strictfp.
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
      !IsStrict)
    return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
  // The transform for i32->f64/f32 isn't correct for 0 when rounding to
  // negative infinity. So disable under strictfp. Using FILD instead.
  if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
      !IsStrict)
    return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
  // On 64-bit targets, i64 -> f32/f64 that reached this point gets no custom
  // lowering here.
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
      (DstVT == MVT::f32 || DstVT == MVT::f64))
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
  int SSFI = cast(StackSlot)->getIndex();
  Align SlotAlign(8);
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
  if (SrcVT == MVT::i32) {
    // Store the i32 at offset 0 and a zero i32 at offset 4, then FILD the
    // slot as an i64.
    SDValue OffsetSlot =
        DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                  OffsetSlot, MPI.getWithOffset(4), SlotAlign);
    std::pair Tmp =
        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
    if (IsStrict)
      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);

    return Tmp.first;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Src;
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  }
  SDValue Store =
      DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. We must be careful to do the computation in x87 extended
  // precision, not in SSE.
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = {Store, StackSlot};
  SDValue Fild =
      DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
                              SlotAlign, MachineMemOperand::MOLoad);
  Chain = Fild.getValue(1);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
  // (0x5F800000 is the IEEE single bit pattern of 2^64.)
  APInt FF(64, 0x5F80000000000000ULL);
  SDValue FudgePtr =
      DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
  Align CPAlignment = cast(FudgePtr)->getAlign();

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
  SDValue Four = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      CPAlignment);
  Chain = Fudge.getValue(1);
  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  if (IsStrict) {
    unsigned Opc = ISD::STRICT_FADD;
    // Windows needs the precision control changed to 80bits around this add.
    if (Subtarget.isOSWindows() && DstVT == MVT::f32)
      Opc = X86ISD::STRICT_FP80_ADD;

    SDValue Add =
        DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
    // STRICT_FP_ROUND can't handle equal types.
    if (DstVT == MVT::f80)
      return Add;
    return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
                       {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
  }
  unsigned Opc = ISD::FADD;
  // Windows needs the precision control changed to 80bits around this add.
  if (Subtarget.isOSWindows() && DstVT == MVT::f32)
    Opc = X86ISD::FP80_ADD;

  // Add the fudge factor in f80, then round to the requested result type.
  SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
}

// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an SDValue().
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence and return the
// result.
//
// The sequence goes through a stack temporary: the value is written with
// X86ISD::FP_TO_INT_IN_MEM (after being reloaded through X86ISD::FLD when it
// lives in an SSE register) and the integer result is loaded back from the
// slot. On return \p Chain holds the chain result of that final load.
SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                           bool IsSigned,
                                           SDValue &Chain) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  // Strict nodes carry the chain as operand 0 and the value as operand 1.
  SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
  EVT TheVT = Value.getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64.  A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;

  // FIXME: This does not generate an invalid exception if the input does not
  // fit in i32. PR44019
  if (!IsSigned && DstTy != MVT::i64) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getStoreSize();
  int SSFI =
      MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  // 0 or 0x8000000000000000, XOR'ed into the result to restore the top bit.
  SDValue Adjust;

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64.  Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    //  Adjust = (Value >= Thresh) ? 0x8000000000000000ULL : 0;
    //  FltOfs = (Value >= Thresh) ? Thresh : 0.0;
    //  FistSrc = (Value - FltOfs);
    //  Fist-to-mem64 FistSrc
    //  XOR the 64-bit result with Adjust, which is equivalent to adding
    //  0 or 0x800...0ULL to it.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
                                   *DAG.getContext(), TheVT);
    SDValue Cmp;
    if (IsStrict) {
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
                         /*IsSignaling*/ true);
      Chain = Cmp.getValue(1);
    } else {
      Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
    }

    // Our preferred lowering of
    //
    // (Value >= Thresh) ? 0x8000000000000000ULL : 0
    //
    // is
    //
    // (Value >= Thresh) << 63
    //
    // but since we can get here after LegalOperations, DAGCombine might do the
    // wrong thing if we create a select. So, directly create the preferred
    // version.
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
    SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
    Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);

    SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
                                   DAG.getConstantFP(0.0, DL, TheVT));

    if (IsStrict) {
      Value = DAG.getNode(ISD::STRICT_FSUB, DL, {TheVT, MVT::Other},
                          {Chain, Value, FltOfs});
      Chain = Value.getValue(1);
    } else
      Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
  }

  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);

  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");

    // Spill the SSE value to the slot and reload it as f80 with an FLD node.
    Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = {Chain, StackSlot};

    unsigned FLDSize = TheVT.getStoreSize();
    assert(FLDSize <= MemSize && "Stack slot not big enough");
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
    Chain = Value.getValue(1);
  }

  // Build the FP_TO_INT*_IN_MEM
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
  SDValue Ops[] = {Chain, Value, StackSlot};
  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
                                         DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);

  // Load back using the original result type; for the widened uint32 case
  // this reads only the low part of the 64-bit slot.
  SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
  Chain = Res.getValue(1);

  // If we need an unsigned fixup, XOR the result with adjust.
  if (UnsignedFixup)
    Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);

  return Res;
}

/// Lower a vector ANY_EXTEND / ZERO_EXTEND whose source and result have the
/// same element count. v32i8 -> v32i16 without BWI is split, targets with
/// Int256 keep the node unchanged, and otherwise the low half is extended
/// in-register while the high half is built with an unpack-high.
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned Opc = Op.getOpcode();

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);

  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    return splitVectorIntUnary(Op, DAG, dl);
  }

  if (Subtarget.hasInt256())
    return Op;

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
  MVT HalfVT = VT.getHalfNumVectorElementsVT();
  SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);

  // Short-circuit if we can determine that each 128-bit half is the same value.
  // Otherwise, this is difficult to match and optimize.
  if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
    if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);

  // Interleave the high elements with zero (zero-extend) or undef
  // (any-extend) and reinterpret the result as the wide half type.
  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Opc == ISD::ZERO_EXTEND;
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  OpHi = DAG.getBitcast(HalfVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
// Each v8i1 half is extended to v8i16 with \p ExtOpc, the halves are
// concatenated to v16i16 and the result is truncated to \p VT.
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
                           DAG.getIntPtrConstant(0, dl));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
                           DAG.getIntPtrConstant(8, dl));
  Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
  Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}

/// Lower a ZERO_EXTEND whose source is a vXi1 mask vector. Non-byte results
/// use sign_extend + logical shift right; vXi8 results select between the
/// constants 1 and 0, extending/widening the working type first when BWI/VLX
/// are unavailable and narrowing back afterwards.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  unsigned NumElts = VT.getVectorNumElements();

  // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
  // avoids a constant pool load.
  if (VT.getVectorElementType() != MVT::i8) {
    SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
    return DAG.getNode(ISD::SRL, DL, VT, Extend,
                       DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
  }

  // Extend VT if BWI is not supported.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI()) {
    // If v16i32 is to be avoided, we'll need to split and concatenate.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);

    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
  }

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, NumElts);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
                     In, DAG.getIntPtrConstant(0, DL));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
  }

  SDValue One = DAG.getConstant(1, DL, WideVT);
  SDValue Zero = DAG.getConstant(0, DL, WideVT);

  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);

  // Truncate if we had to extend above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(MVT::i8, NumElts);
    SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
                              DAG.getIntPtrConstant(0, DL));

  return SelectedVal;
}

/// Custom lowering entry point for ZERO_EXTEND: vXi1 sources go to
/// LowerZERO_EXTEND_Mask, everything else to LowerAVXExtend.
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();
  SDLoc DL(Op);

  if (SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);

  assert(Subtarget.hasAVX() && "Expected AVX support");
  return LowerAVXExtend(Op, DL, DAG, Subtarget);
}

/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
///
/// \p Opcode must be X86ISD::PACKSS or X86ISD::PACKUS. Returns \p In itself
/// when no truncation is needed, and an empty SDValue when SSE2 is missing or
/// the element count is not a power of two of at least 2.
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");
  assert(DstVT.isVector() && "VT not a vector?");

  // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  unsigned NumElems = SrcVT.getVectorNumElements();
  if (NumElems < 2 || !isPowerOf2_32(NumElems))
    return SDValue();

  unsigned DstSizeInBits = DstVT.getSizeInBits();
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

  // Type produced by one packing stage: same element count, half-width
  // elements.
  LLVMContext &Ctx = *DAG.getContext();
  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (SrcVT.getScalarSizeInBits() > 16 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
    InVT = MVT::i32;
    OutVT = MVT::i16;
  }

  // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
  // On pre-AVX512, pack the src in both halves to help value tracking.
  if (SrcSizeInBits <= 128) {
    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
    In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
    SDValue LHS = DAG.getBitcast(InVT, In);
    SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
    Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Split lower/upper subvectors.
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = splitVector(In, DAG, DL);

  // If Hi is undef, then don't bother packing it and widen the result instead.
  if (Hi.isUndef()) {
    EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
    if (SDValue Res =
            truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
      return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
  }

  unsigned SubSizeInBits = SrcSizeInBits / 2;
  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
    SmallVector<int, 64> Mask;
    int Scale = 64 / OutVT.getScalarSizeInBits();
    narrowShuffleMaskElts(Scale, {0, 2, 1, 3}, Mask);
    Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512bit -> 128bit truncate another stage.
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");

  if (PackedVT.is128BitVector()) {
    // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
    // type legalization.
    SDValue Res =
        truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
  Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}

/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// MaskX = X & 0xffff (clear high bits to prevent saturation)
/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  In = DAG.getZeroExtendInReg(In, DL, DstVT);
  return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
}

/// Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  EVT SrcVT = In.getValueType();
  In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
                   DAG.getValueType(DstVT));
  return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
}

/// Helper to determine if \p In truncated to \p DstVT has the necessary
/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
/// possibly by converting a SRL node to SRA for sign extension.
///
/// On success \p PackOpcode is set to X86ISD::PACKUS or X86ISD::PACKSS and the
/// value to pack is returned (either \p In or a new SRA node). On failure an
/// empty SDValue is returned and \p PackOpcode is left untouched.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
                                     SDValue In, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Requires SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT SrcVT = In.getValueType();
  EVT DstSVT = DstVT.getVectorElementType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumDstEltBits = DstSVT.getSizeInBits();
  unsigned NumSrcEltBits = SrcSVT.getSizeInBits();

  // Check we have a truncation suited for PACKSS/PACKUS.
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
    return SDValue();

  assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
  // Each PACK stage halves the element width.
  unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);

  // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
  // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
  // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
  if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
      (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
      (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
    return SDValue();

  // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
  // split this for packing.
  if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
      !isFreeToSplitVector(In.getNode(), DAG) &&
      (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
    return SDValue();

  // Don't truncate AVX512 targets as multiple PACK nodes stages.
  if (Subtarget.hasAVX512() && NumStages > 1)
    return SDValue();

  unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Truncate with PACKUS if we are truncating a vector with leading zero
  // bits that extend all the way to the packed/truncated value.
  // e.g. Masks, zext_in_reg, etc.
  // Pre-SSE41 we can only use PACKUSWB.
  KnownBits Known = DAG.computeKnownBits(In);
  if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
    PackOpcode = X86ISD::PACKUS;
    return In;
  }

  // Truncate with PACKSS if we are truncating a vector with sign-bits
  // that extend all the way to the packed/truncated value.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);

  // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
  // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
  // see through BITCASTs later on and combines/simplifications can't then use
  // it.
  if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
      !Subtarget.hasAVX512())
    return SDValue();

  unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
  if (MinSignBits < NumSignBits) {
    PackOpcode = X86ISD::PACKSS;
    return In;
  }

  // If we have a srl that only generates signbits that we will discard in
  // the truncation then we can use PACKSS by converting the srl to a sra.
  // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
  if (In.getOpcode() == ISD::SRL && In->hasOneUse())
    if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
      if (*ShAmt == MinSignBits) {
        PackOpcode = X86ISD::PACKSS;
        return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
      }
    }

  return SDValue();
}

/// This function lowers a vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
/// Returns an empty SDValue when the element types are unsuitable or when
/// matchTruncateWithPACK finds no usable PACK opcode.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
                                                const SDLoc &DL,
                                                const X86Subtarget &Subtarget,
                                                SelectionDAG &DAG) {
  MVT SrcVT = In.getSimpleValueType();
  MVT DstSVT = DstVT.getVectorElementType();
  MVT SrcSVT = SrcVT.getVectorElementType();
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
    return SDValue();

  // If the upper half of the source is undef, then attempt to split and
  // only truncate the lower half.
  if (DstVT.getSizeInBits() >= 128) {
    if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
      MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
      if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
                                                         Subtarget, DAG))
        return widenSubVector(Res, false, Subtarget, DAG, DL,
                              DstVT.getSizeInBits());
    }
  }

  unsigned PackOpcode;
  if (SDValue Src =
          matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
    return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);

  return SDValue();
}

/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations.
/// Unlike LowerTruncateVecPackWithSignBits, the source is first extended
/// in-register (zero or sign) by the truncateVectorWithPACKUS/PACKSS helpers.
/// Only power-of-two element counts of at least 8 are handled; otherwise an
/// empty SDValue is returned.
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT SrcVT = In.getSimpleValueType();
  MVT DstSVT = DstVT.getVectorElementType();
  MVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumElems = DstVT.getVectorNumElements();
  if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
        (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8) {
    if (SrcSVT == MVT::i16)
      return SDValue();
    if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
      return SDValue();
  }

  // If the upper half of the source is undef, then attempt to split and
  // only truncate the lower half.
  if (DstVT.getSizeInBits() >= 128) {
    if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
      MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
      if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
        return widenSubVector(Res, false, Subtarget, DAG, DL,
                              DstVT.getSizeInBits());
    }
  }

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
    return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);

  if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
    return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);

  // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
  if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
    MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
    return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
  }

  return SDValue();
}

/// Lower a TRUNCATE whose result is a vXi1 mask vector: the LSB of every
/// source element is moved into the sign position (unless the element is
/// already known to be all sign bits) and the mask is produced with a setcc
/// against zero.
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // legal, will go to VPMOVB2M, VPMOVW2M
      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into sign position.
        // Shift packed bytes not supported natively, bitcast to word
        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits() / 16);
        In = DAG.getNode(ISD::SHL, DL, ExtVT,
                         DAG.getBitcast(ExtVT, In),
                         DAG.getConstant(ShiftInx, DL, ExtVT));
        In = DAG.getBitcast(InVT, In);
      }
      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
                          In, ISD::SETGT);
    }
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
    // We need to change to a wider element type that we have support for.
    // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
    // For 16 element vectors we extend to v16i32 unless we are explicitly
    // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
    // we need to split into two 8 element vectors which we can extend to v8i32,
    // truncate and concat the results. There's an additional complication if
    // the original type is v16i8. In that case we can't split the v16i8
    // directly, so we need to shuffle high elements to low and use
    // sign_extend_vector_inreg.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
      SDValue Lo, Hi;
      if (InVT == MVT::v16i8) {
        Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
        Hi = DAG.getVectorShuffle(
            InVT, DL, In, In,
            {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
        Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
      } else {
        assert(InVT == MVT::v16i16 && "Unexpected VT!");
        Lo = extract128BitVector(In, 0, DAG, DL);
        Hi = extract128BitVector(In, 8, DAG, DL);
      }
      // We're split now, just emit two truncates and a concat. The two
      // truncates will trigger legalization to come back to this function.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
      Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }
    // We either have 8 elements or we're allowed to use 512-bit vectors.
    // If we have VLX, we want to use the narrowest vector that can get the
    // job done so we use vXi32.
    MVT EltVT =
        Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512 / NumElts);
    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
    // We need to shift to get the lsb into sign position.
    In = DAG.getNode(ISD::SHL, DL, InVT, In,
                     DAG.getConstant(ShiftInx, DL, InVT));
  }
  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
  if (Subtarget.hasDQI())
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
  return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
}

/// Custom lowering for vector TRUNCATE. Handles requests coming from the type
/// legalizer (illegal source or result type), vXi1 results, PACKSS/PACKUS
/// based truncation, AVX512 truncates that are already legal, and finally the
/// remaining 256-bit -> 128-bit cases via shuffles or PACK.
/// Returns an empty SDValue to request default legalization.
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  // If we're called by the type legalizer, handle a few cases.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
    if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
        VT.is128BitVector() && Subtarget.hasAVX512()) {
      assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
             "Unexpected subtarget!");
      // The default behavior is to truncate one step, concatenate, and then
      // truncate the remainder. We'd rather produce two 64-bit results and
      // concatenate those.
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = DAG.SplitVector(In, DL);

      EVT LoVT, HiVT;
      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

      Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
      Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }

    // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
    if (!Subtarget.hasAVX512() ||
        (InVT.is512BitVector() && VT.is256BitVector()))
      if (SDValue SignPack =
              LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
        return SignPack;

    // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
    if (!Subtarget.hasAVX512())
      return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);

    // Otherwise let default legalization handle it.
    return SDValue();
  }

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DL, DAG, Subtarget);

  // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
  // concat from subvectors to use VPTRUNC etc.
  if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
    if (SDValue SignPack =
            LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
      return SignPack;

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
      assert(VT == MVT::v32i8 && "Unexpected VT!");
      return splitVectorIntUnary(Op, DAG, DL);
    }

    // word to byte only under BWI. Otherwise we have to promoted to v16i32
    // and then truncate that. But we should only do that if we haven't been
    // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
    // handled by isel patterns.
    if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
        Subtarget.canExtendTo512DQ())
      return Op;
  }

  // Handle truncation of V256 to V128 using shuffles.
  assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    // Without AVX2: take the even i32 lanes of the two 128-bit halves.
    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
                                DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getBitcast(MVT::v32i8, In);
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      // Gather the low 64 bits of each 128-bit lane into the low half.
      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(MVT::v8i16, In);
    }

    return Subtarget.hasSSE41()
               ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
               : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
  }

  if (VT == MVT::v16i8 && InVT == MVT::v16i16)
    return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);

  llvm_unreachable("All 256->128 cases should have been handled above!");
}

// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
// behaves on out of range inputs to generate optimized conversions.
// Expands a vXi32 unsigned FP->int conversion using two signed truncating
// conversions (one of Src, one of Src - 2^31) that are then combined.
//   VT        - result vector type; scalar size must be 32 bits (asserted).
//   Src       - floating-point source vector.
//   Subtarget - consulted to choose BLENDV over sign-splat masking for
//               v8i32 without AVX2.
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT SrcVT = Src.getSimpleValueType();
  unsigned DstBits = VT.getScalarSizeInBits();
  assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");

  // Calculate the converted result for values in the range 0 to
  // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
  SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
  SDValue Big =
      DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
                  DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
                              DAG.getConstantFP(2147483648.0f, dl, SrcVT)));

  // The "CVTTP2SI" instruction conveniently sets the sign bit if
  // and only if the value was out of range. So we can use that
  // as our indicator that we rather use "Big" instead of "Small".
  //
  // Use "Small" if "IsOverflown" has all bits cleared
  // and "0x80000000 | Big" if all bits in "IsOverflown" are set.

  // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
  // use the slightly slower blendv select instead.
  if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
    SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
    return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
  }

  SDValue IsOverflown =
      DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
                  DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
  return DAG.getNode(ISD::OR, dl, VT, Small,
                     DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
}

// Custom lowering for FP_TO_SINT / FP_TO_UINT and their STRICT_ variants.
// For strict nodes operand 0 is the chain and operand 1 the source; results
// are then returned as a {value, chain} merge. Returns Op unchanged when the
// node is already acceptable, and an empty SDValue on the paths commented
// below as using the default expansion.
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
  MVT SrcVT = Src.getSimpleValueType();
  SDLoc dl(Op);

  SDValue Res;
  if (isSoftF16(SrcVT, Subtarget)) {
    // Soft f16 sources are first extended to f32 (element-wise for vectors).
    MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
                         {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
                                             {NVT, MVT::Other}, {Chain, Src})});
    return DAG.getNode(Op.getOpcode(), dl, VT,
                       DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
  } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
    return Op;
  }

  if (VT.isVector()) {
    if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
      MVT ResVT = MVT::v4i32;
      MVT TruncVT = MVT::v4i1;
      unsigned Opc;
      if (IsStrict)
        Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
      else
        Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;

      if (!IsSigned && !Subtarget.hasVLX()) {
        assert(Subtarget.useAVX512Regs() && "Unexpected features!");
        // Widen to 512-bits.
        ResVT = MVT::v8i32;
        TruncVT = MVT::v8i1;
        Opc = Op.getOpcode();
        // Need to concat with zero vector for strict fp to avoid spurious
        // exceptions.
        // TODO: Should we just do this for non-strict as well?
        SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
                               : DAG.getUNDEF(MVT::v8f64);
        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
                          DAG.getIntPtrConstant(0, dl));
      }
      if (IsStrict) {
        Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(Opc, dl, ResVT, Src);
      }

      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
                        DAG.getIntPtrConstant(0, dl));
      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
      if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
        return Op;

      MVT ResVT = VT;
      MVT EleVT = VT.getVectorElementType();
      if (EleVT != MVT::i64)
        ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;

      if (SrcVT != MVT::v8f16) {
        // Pad the source up to v8f16: zeros for strict FP, undef otherwise.
        SDValue Tmp =
            IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
        SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
        Ops[0] = Src;
        Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
      }

      if (IsStrict) {
        Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
                                   : X86ISD::STRICT_CVTTP2UI,
                          dl, {ResVT, MVT::Other}, {Chain, Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
                          ResVT, Src);
      }

      // TODO: Need to add exception check code for strict FP.
      if (EleVT.getSizeInBits() < 16) {
        ResVT = MVT::getVectorVT(EleVT, 8);
        Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
      }

      if (ResVT != VT)
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
                          DAG.getIntPtrConstant(0, dl));

      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
    if (VT.getVectorElementType() == MVT::i16) {
      assert((SrcVT.getVectorElementType() == MVT::f32 ||
              SrcVT.getVectorElementType() == MVT::f64) &&
             "Expected f32/f64 vector!");
      MVT NVT = VT.changeVectorElementType(MVT::i32);
      if (IsStrict) {
        Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
                                   : ISD::STRICT_FP_TO_UINT,
                          dl, {NVT, MVT::Other}, {Chain, Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
                          NVT, Src);
      }

      // TODO: Need to add exception check code for strict FP.

      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);

      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
    if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
      assert(!IsSigned && "Expected unsigned conversion!");
      assert(Subtarget.useAVX512Regs() && "Requires avx512f");
      return Op;
    }

    // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
    if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
        (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
        Subtarget.useAVX512Regs()) {
      assert(!IsSigned && "Expected unsigned conversion!");
      assert(!Subtarget.hasVLX() && "Unexpected features!");
      MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
      MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
      // Need to concat with zero vector for strict fp to avoid spurious
      // exceptions.
      // TODO: Should we just do this for non-strict as well?
      SDValue Tmp =
          IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
                        DAG.getIntPtrConstant(0, dl));

      if (IsStrict) {
        Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
                          {Chain, Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
      }

      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
                        DAG.getIntPtrConstant(0, dl));

      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
    if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
        (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
        Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
      assert(!Subtarget.hasVLX() && "Unexpected features!");
      MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
      // Need to concat with zero vector for strict fp to avoid spurious
      // exceptions.
      // TODO: Should we just do this for non-strict as well?
      SDValue Tmp =
          IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
                        DAG.getIntPtrConstant(0, dl));

      if (IsStrict) {
        Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
                          {Chain, Src});
        Chain = Res.getValue(1);
      } else {
        Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
      }

      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
                        DAG.getIntPtrConstant(0, dl));

      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
      if (!Subtarget.hasVLX()) {
        // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
        // legalizer and then widened again by vector op legalization.
        if (!IsStrict)
          return SDValue();

        SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
        SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
                                  {Src, Zero, Zero, Zero});
        Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
                          {Chain, Tmp});
        // Reuse the function-level Chain rather than shadowing it.
        Chain = Tmp.getValue(1);
        Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
                          DAG.getIntPtrConstant(0, dl));
        return DAG.getMergeValues({Tmp, Chain}, dl);
      }

      assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
      SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                DAG.getUNDEF(MVT::v2f32));
      if (IsStrict) {
        unsigned Opc =
            IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
        return DAG.getNode(Opc, dl, {VT, MVT::Other},
                           {Op->getOperand(0), Tmp});
      }
      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
      return DAG.getNode(Opc, dl, VT, Tmp);
    }

    // Generate optimized instructions for pre AVX512 unsigned conversions from
    // vXf32 to vXi32.
    if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
        (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
        (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
      assert(!IsSigned && "Expected unsigned conversion!");
      return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
    }

    return SDValue();
  }

  assert(!VT.isVector());

  bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);

  if (!IsSigned && UseSSEReg) {
    // Conversions from f32/f64 with AVX512 should be legal.
    if (Subtarget.hasAVX512())
      return Op;

    // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
    // behaves on out of range inputs to generate optimized conversions.
    if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
                      (VT == MVT::i64 && Subtarget.is64Bit()))) {
      unsigned DstBits = VT.getScalarSizeInBits();
      APInt UIntLimit = APInt::getSignMask(DstBits);
      SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
                                        DAG.getConstant(UIntLimit, dl, VT));
      MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());

      // Calculate the converted result for values in the range:
      // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
      // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
      SDValue Small =
          DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
      SDValue Big = DAG.getNode(
          X86ISD::CVTTS2SI, dl, VT,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
                      DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));

      // The "CVTTS2SI" instruction conveniently sets the sign bit if
      // and only if the value was out of range. So we can use that
      // as our indicator that we rather use "Big" instead of "Small".
      //
      // Use "Small" if "IsOverflown" has all bits cleared
      // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
      SDValue IsOverflown = DAG.getNode(
          ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
      return DAG.getNode(ISD::OR, dl, VT, Small,
                         DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
    }

    // Use default expansion for i64.
    if (VT == MVT::i64)
      return SDValue();

    assert(VT == MVT::i32 && "Unexpected VT!");

    // Promote i32 to i64 and use a signed operation on 64-bit targets.
    // FIXME: This does not generate an invalid exception if the input does not
    // fit in i32. PR44019
    if (Subtarget.is64Bit()) {
      if (IsStrict) {
        Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
                          {Chain, Src});
        Chain = Res.getValue(1);
      } else
        Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);

      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
      if (IsStrict)
        return DAG.getMergeValues({Res, Chain}, dl);
      return Res;
    }

    // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
    // use fisttp which will be handled later.
    if (!Subtarget.hasSSE3())
      return SDValue();
  }

  // Promote i16 to i32 if we can use a SSE operation or the type is f128.
  // FIXME: This does not generate an invalid exception if the input does not
  // fit in i16. PR44019
  if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
    assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
    if (IsStrict) {
      Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
                        {Chain, Src});
      Chain = Res.getValue(1);
    } else
      Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);

    Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, dl);
    return Res;
  }

  // If this is a FP_TO_SINT using SSEReg we're done.
  if (UseSSEReg && IsSigned)
    return Op;

  // fp128 needs to use a libcall.
  if (SrcVT == MVT::f128) {
    RTLIB::Libcall LC;
    if (IsSigned)
      LC = RTLIB::getFPTOSINT(SrcVT, VT);
    else
      LC = RTLIB::getFPTOUINT(SrcVT, VT);

    MakeLibCallOptions CallOptions;
    // makeLibCall yields {result value, output chain}.
    std::pair<SDValue, SDValue> Tmp =
        makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);

    if (IsStrict)
      return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);

    return Tmp.first;
  }

  // Fall back to X87.
if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
    if (IsStrict)
      return DAG.getMergeValues({V, Chain}, dl);
    return V;
  }

  llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}

// Custom lowering for LRINT/LLRINT. Vector sources are kept only when the
// destination scalar type is i32; f16 sources return an empty SDValue;
// sources held in SSE registers are returned unchanged; everything else is
// handed to LRINT_LLRINTHelper.
SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  EVT DstVT = Op.getSimpleValueType();
  MVT SrcVT = Src.getSimpleValueType();

  if (SrcVT.isVector())
    return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();

  if (SrcVT == MVT::f16)
    return SDValue();

  // If the source is in an SSE register, the node is Legal.
  if (isScalarFPTypeInSSEReg(SrcVT))
    return Op;

  return LRINT_LLRINTHelper(Op.getNode(), DAG);
}

// Lowers LRINT/LLRINT of an f32/f64/f80 source through a stack temporary:
// an SSE-register source is first stored and reloaded with X86ISD::FLD, then
// X86ISD::FIST stores the integer to the slot and the result is loaded back.
// Returns an empty SDValue for any other source type.
SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
                                              SelectionDAG &DAG) const {
  EVT DstVT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return SDValue();
  }

  SDLoc DL(N);
  SDValue Chain = DAG.getEntryNode();

  bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);

  // If we're converting from SSE, the stack slot needs to hold both types.
  // Otherwise it only needs to hold the DstVT.
  EVT OtherVT = UseSSE ? SrcVT : DstVT;
  SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
  // The stack temporary is a frame-index node; its index keys the
  // MachinePointerInfo used by every memory node below.
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);

  if (UseSSE) {
    assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
    Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
    SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
    SDValue Ops[] = { Chain, StackPtr };

    Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
                                  /*Align*/ std::nullopt,
                                  MachineMemOperand::MOLoad);
    Chain = Src.getValue(1);
  }

  SDValue StoreOps[] = { Chain, Src, StackPtr };
  Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
                                  StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
                                  MachineMemOperand::MOStore);

  return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
}

// Custom lowering for FP_TO_SINT_SAT / FP_TO_UINT_SAT. Operand 0 is the FP
// source and operand 1 carries the saturation type. Returns an empty SDValue
// when the source is not a scalar FP type held in an SSE register or is a
// soft f16.
SDValue
X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
  // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
  // but making use of X86 specifics to produce better instruction sequences.
  SDNode *Node = Op.getNode();
  bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
  unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
  SDLoc dl(SDValue(Node, 0));
  SDValue Src = Node->getOperand(0);

  // There are three types involved here: SrcVT is the source floating point
  // type, DstVT is the type of the result, and TmpVT is the result of the
  // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
  // DstVT).
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Node->getValueType(0);
  EVT TmpVT = DstVT;

  // This code is only for floats and doubles. Fall back to generic code for
  // anything else.
  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
    return SDValue();

  EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
  unsigned SatWidth = SatVT.getScalarSizeInBits();
  unsigned DstWidth = DstVT.getScalarSizeInBits();
  unsigned TmpWidth = TmpVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
         "Expected saturation width smaller than result width");

  // Promote result of FP_TO_*INT to at least 32 bits.
  if (TmpWidth < 32) {
    TmpVT = MVT::i32;
    TmpWidth = 32;
  }

  // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
  // us to use a native signed conversion instead.
  if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
    TmpVT = MVT::i64;
    TmpWidth = 64;
  }

  // If the saturation width is smaller than the size of the temporary result,
  // we can always use signed conversion, which is native.
  if (SatWidth < TmpWidth)
    FpToIntOpcode = ISD::FP_TO_SINT;

  // Determine minimum and maximum integer values and their corresponding
  // floating-point values.
  APInt MinInt, MaxInt;
  if (IsSigned) {
    MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
    MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
  } else {
    MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
    MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
  }

  APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
  APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));

  APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
    MinInt, IsSigned, APFloat::rmTowardZero);
  APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
    MaxInt, IsSigned, APFloat::rmTowardZero);
  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
                          && !(MaxStatus & APFloat::opStatus::opInexact);

  SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
  SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);

  // If the integer bounds are exactly representable as floats, emit a
  // min+max+fptoi sequence. Otherwise use comparisons and selects.
  if (AreExactFloatBounds) {
    if (DstVT != TmpVT) {
      // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
      SDValue MinClamped = DAG.getNode(
        X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
      // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
      SDValue BothClamped = DAG.getNode(
        X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
      // Convert clamped value to integer.
      SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);

      // NaN will become INDVAL, with the top bit set and the rest zero.
      // Truncation will discard the top bit, resulting in zero.
      return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
    }

    // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
    SDValue MinClamped = DAG.getNode(
      X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
    // Clamp by MaxFloat from above. NaN cannot occur.
    SDValue BothClamped = DAG.getNode(
      X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
    // Convert clamped value to integer.
    SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);

    if (!IsSigned) {
      // In the unsigned case we're done, because we mapped NaN to MinFloat,
      // which is zero.
      return FpToInt;
    }

    // Otherwise, select zero if Src is NaN.
    SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
    return DAG.getSelectCC(
      dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
  }

  SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
  SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);

  // Result of direct conversion, which may be selected away.
  SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);

  if (DstVT != TmpVT) {
    // NaN will become INDVAL, with the top bit set and the rest zero.
    // Truncation will discard the top bit, resulting in zero.
    FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
  }

  SDValue Select = FpToInt;
  // For signed conversions where we saturate to the same size as the
  // result type of the fptoi instructions, INDVAL coincides with integer
  // minimum, so we don't need to explicitly check it.
  if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
    // If Src ULT MinFloat, select MinInt. In particular, this also selects
    // MinInt if Src is NaN.
    Select = DAG.getSelectCC(
      dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
  }

  // If Src OGT MaxFloat, select MaxInt.
  Select = DAG.getSelectCC(
    dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);

  // In the unsigned case we are done, because we mapped NaN to MinInt, which
  // is already zero. The promoted case was already handled above.
  if (!IsSigned || DstVT != TmpVT) {
    return Select;
  }

  // Otherwise, select 0 if Src is NaN.
  SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
  return DAG.getSelectCC(
    dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
}

SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue In = Op.getOperand(IsStrict ?
1 : 0);
  MVT SVT = In.getSimpleValueType();

  // Let f16->f80 get lowered to a libcall, except for darwin, where we should
  // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
  if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
                          !Subtarget.getTargetTriple().isOSDarwin()))
    return SDValue();

  if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
      (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
    return Op;

  if (SVT == MVT::f16) {
    if (Subtarget.hasFP16())
      return Op;

    // Any destination other than f32 is reached in two steps: f16 -> f32,
    // then f32 -> VT.
    if (VT != MVT::f32) {
      if (IsStrict)
        return DAG.getNode(
            ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
            {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
                                {MVT::f32, MVT::Other}, {Chain, In})});

      return DAG.getNode(ISD::FP_EXTEND, DL, VT,
                         DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
    }

    if (!Subtarget.hasF16C()) {
      if (!Subtarget.getTargetTriple().isOSDarwin())
        return SDValue();

      assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");

      // Need a libcall, but ABI for f16 is soft-float on MacOS.
      TargetLowering::CallLoweringInfo CLI(DAG);
      Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

      // The half value is passed to the libcall as a zero-extended i16.
      In = DAG.getBitcast(MVT::i16, In);
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = In;
      Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
      Entry.IsSExt = false;
      Entry.IsZExt = true;
      Args.push_back(Entry);

      SDValue Callee = DAG.getExternalSymbol(
          getLibcallName(RTLIB::FPEXT_F16_F32),
          getPointerTy(DAG.getDataLayout()));
      CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
          CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args));

      SDValue Res;
      std::tie(Res,Chain) = LowerCallTo(CLI);
      if (IsStrict)
        Res = DAG.getMergeValues({Res, Chain}, DL);

      return Res;
    }

    // F16C path: place the i16 bits in element 0 of a zero v8i16, convert
    // with CVTPH2PS and extract element 0 of the v4f32 result.
    In = DAG.getBitcast(MVT::i16, In);
    In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
                     getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
                     DAG.getIntPtrConstant(0, DL));
    SDValue Res;
    if (IsStrict) {
      Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
                        {Chain, In});
      Chain = Res.getValue(1);
    } else {
      Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
                        DAG.getTargetConstant(4, DL, MVT::i32));
    }
    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
                      DAG.getIntPtrConstant(0, DL));
    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, DL);
    return Res;
  }

  if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
    return Op;

  if (SVT.getVectorElementType() == MVT::f16) {
    if (Subtarget.hasFP16() && isTypeLegal(SVT))
      return Op;
    assert(Subtarget.hasF16C() && "Unexpected features!");
    // Pad the source with undef up to v8f16 before the vector extend.
    if (SVT == MVT::v2f16)
      In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
                       DAG.getUNDEF(MVT::v2f16));
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
                              DAG.getUNDEF(MVT::v4f16));
    if (IsStrict)
      return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
                         {Op->getOperand(0), Res});
    return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
  } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
    return Op;
  }

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  // Widen v2f32 to v4f32 with an undef upper half, then extend.
  SDValue Res =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
  if (IsStrict)
    return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
                       {Op->getOperand(0), Res});
  return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
}

// Custom lowering for FP_ROUND / STRICT_FP_ROUND. For strict nodes operand 0
// is the chain and operand 1 the source. Returns an empty SDValue for f128
// sources, f80->f16, and the f16/bf16 cases rejected below; returns Op
// unchanged on the paths that keep the node.
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();

  SDLoc DL(Op);
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  MVT VT = Op.getSimpleValueType();
  MVT SVT = In.getSimpleValueType();

  if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
    return SDValue();

  if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
      !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
    if (!Subtarget.getTargetTriple().isOSDarwin())
      return SDValue();

    // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
    TargetLowering::CallLoweringInfo CLI(DAG);
    Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

    TargetLowering::ArgListTy Args;
    TargetLowering::ArgListEntry Entry;
    Entry.Node = In;
    Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
    Entry.IsSExt = false;
    Entry.IsZExt = true;
    Args.push_back(Entry);

    SDValue Callee = DAG.getExternalSymbol(
        getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
                                       : RTLIB::FPROUND_F32_F16),
        getPointerTy(DAG.getDataLayout()));
    CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
        CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
        std::move(Args));

    // The libcall is declared as returning i16; bitcast it back to f16.
    SDValue Res;
    std::tie(Res, Chain) = LowerCallTo(CLI);

    Res = DAG.getBitcast(MVT::f16, Res);

    if (IsStrict)
      Res = DAG.getMergeValues({Res, Chain}, DL);

    return Res;
  }

  if (VT.getScalarType() == MVT::bf16) {
    if (SVT.getScalarType() == MVT::f32 &&
        ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
         Subtarget.hasAVXNECONVERT()))
      return Op;
    return SDValue();
  }

  if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
    if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
      return SDValue();

    if (VT.isVector())
      return Op;

    // Scalar f32 -> f16 with F16C: convert element 0 of a v4f32 with CVTPS2PH
    // and extract element 0 of the v8i16 result.
    SDValue Res;
    SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
                                        MVT::i32);
    if (IsStrict) {
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
                        DAG.getConstantFP(0, DL, MVT::v4f32), In,
                        DAG.getIntPtrConstant(0, DL));
      Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
                        {Chain, Res, Rnd});
      Chain = Res.getValue(1);
    } else {
      // FIXME: Should we use zeros for upper elements for non-strict?
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
      Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
    }

    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
                      DAG.getIntPtrConstant(0, DL));
    Res = DAG.getBitcast(MVT::f16, Res);

    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, DL);

    return Res;
  }

  return Op;
}

// Lowers an i16 -> f32 FP16_TO_FP (strict or not): the i16 is inserted into
// element 0 of a zero v8i16, converted with CVTPH2PS, and element 0 of the
// v4f32 result is extracted.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
         "Unexpected VT!");

  SDLoc dl(Op);
  SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
                            DAG.getConstant(0, dl, MVT::v8i16), Src,
                            DAG.getIntPtrConstant(0, dl));

  SDValue Chain;
  if (IsStrict) {
    Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
                      {Op.getOperand(0), Res});
    Chain = Res.getValue(1);
  } else {
    Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
  }

  Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                    DAG.getIntPtrConstant(0, dl));

  if (IsStrict)
    return DAG.getMergeValues({Res, Chain}, dl);

  return Res;
}

// Lowers an f32 -> i16 FP_TO_FP16 (strict or not) via CVTPS2PH on a v4f32
// whose element 0 holds the source.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
         "Unexpected VT!");

  SDLoc dl(Op);
  SDValue Res, Chain;
  if (IsStrict) {
    Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
                      DAG.getConstantFP(0, dl, MVT::v4f32), Src,
                      DAG.getIntPtrConstant(0, dl));
    Res = DAG.getNode(
        X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
        {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
    Chain = Res.getValue(1);
  } else {
    // FIXME: Should we use zeros for upper elements for non-strict?
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src); Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, DAG.getTargetConstant(4, dl, MVT::i32)); } Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res, DAG.getIntPtrConstant(0, dl)); if (IsStrict) return DAG.getMergeValues({Res, Chain}, dl); return Res; } SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT SVT = Op.getOperand(0).getSimpleValueType(); if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) || Subtarget.hasAVXNECONVERT())) { SDValue Res; Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0)); Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res); Res = DAG.getBitcast(MVT::v8i16, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, DAG.getIntPtrConstant(0, DL)); } MakeLibCallOptions CallOptions; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16); SDValue Res = makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first; return DAG.getBitcast(MVT::i16, Res); } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If both operands have other uses, this is probably not profitable. SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (!LHS.hasOneUse() && !RHS.hasOneUse()) return Op; // FP horizontal add/sub were added with SSE3. Integer with SSSE3. bool IsFP = Op.getSimpleValueType().isFloatingPoint(); if (IsFP && !Subtarget.hasSSE3()) return Op; if (!IsFP && !Subtarget.hasSSSE3()) return Op; // Extract from a common vector. 
if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getOperand(0) != RHS.getOperand(0) || !isa(LHS.getOperand(1)) || !isa(RHS.getOperand(1)) || !shouldUseHorizontalOp(true, DAG, Subtarget)) return Op; // Allow commuted 'hadd' ops. // TODO: Allow commuted (f)sub by negating the result of (F)HSUB? unsigned HOpcode; switch (Op.getOpcode()) { // clang-format off case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: llvm_unreachable("Trying to lower unsupported opcode to horizontal op"); // clang-format on } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 && (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) std::swap(LExtIndex, RExtIndex); if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1)) return Op; SDValue X = LHS.getOperand(0); EVT VecVT = X.getValueType(); unsigned BitWidth = VecVT.getSizeInBits(); unsigned NumLanes = BitWidth / 128; unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes; assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && "Not expecting illegal vector widths here"); // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit // equivalent, so extract the 256/512-bit source op to 128-bit if we can. 
if (BitWidth == 256 || BitWidth == 512) { unsigned LaneIdx = LExtIndex / NumEltsPerLane; X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL); LExtIndex %= NumEltsPerLane; } // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, DAG.getIntPtrConstant(LExtIndex / 2, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget); } /// ISD::FROUND is defined to round to nearest with ties rounding away from 0. /// This mode isn't supported in hardware on X86. But as long as we aren't /// compiling with trapping math, we can emulate this with /// trunc(X + copysign(nextafter(0.5, 0.0), X)). static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // N0 += copysign(nextafter(0.5, 0.0), N0) const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); bool Ignored; APFloat Point5Pred = APFloat(0.5f); Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored); Point5Pred.next(/*nextDown*/true); SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT, DAG.getConstantFP(Point5Pred, dl, VT), N0); N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder); // Truncate the result to remove fraction. 
return DAG.getNode(ISD::FTRUNC, dl, VT, N0); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); assert(VT.isFloatingPoint() && VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Unexpected type in LowerFABSorFNEG"); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (VT == MVT::f32) ? MVT::v4f32 : MVT::v8f16; unsigned EltBits = VT.getScalarSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); unsigned LogicOp = IsFABS ? X86ISD::FAND : IsFNABS ? 
X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, // and extract the scalar result back out. Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Mag = Op.getOperand(0); SDValue Sign = Op.getOperand(1); SDLoc dl(Op); // If the sign operand is smaller, extend it first. MVT VT = Op.getSimpleValueType(); if (Sign.getSimpleValueType().bitsLT(VT)) Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); // And if it is bigger, shrink it first. if (Sign.getSimpleValueType().bitsGT(VT)) Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); assert(VT.isFloatingPoint() && VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); // Perform all scalar logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. // TODO: This isn't necessary. If we used scalar types, we might avoid some // unnecessary splats, but we might miss load folding opportunities. Should // this decision be based on OptimizeForSize? bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (VT == MVT::f32) ? MVT::v4f32 : MVT::v8f16; // The mask constants are automatically splatted for vector types. 
unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); SDValue MagMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). if (IsFakeVector) Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); // Next, clear the sign bit from the first operand (magnitude). // TODO: If we had general constant folding for FP logic ops, this check // wouldn't be necessary. SDValue MagBits; if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) { APFloat APF = Op0CN->getValueAPF(); APF.clearSign(); MagBits = DAG.getConstantFP(APF, dl, LogicVT); } else { // If the magnitude operand wasn't a constant, we need to AND out the sign. if (IsFakeVector) Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); } // OR the magnitude value with the sign bit. SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT OpVT = N0.getSimpleValueType(); assert((OpVT == MVT::f32 || OpVT == MVT::f64) && "Unexpected type for FGETSIGN"); // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); Res = DAG.getZExtOrTrunc(Res, dl, VT); Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); return Res; } /// Helper for attempting to create a X86ISD::BT node. 
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL,
                     SelectionDAG &DAG) {
  // BT has no i8 form, so i8 sources are any-extended to i32. That is safe
  // because the shift amount is in-range-or-undefined. i16 is widened to i32
  // as well: the i16 encoding is larger than the i32 one, so this helps both
  // performance and code size.
  if (Src.getValueType().getScalarSizeInBits() < 32)
    Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);

  // Give up (empty SDValue) when the source type is not legal.
  EVT SrcVT = Src.getValueType();
  if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
    return SDValue();

  // Prefer the 32-bit instruction over the 64-bit one for its shorter
  // encoding. The 32-bit form takes BitNo modulo 32 while the 64-bit form
  // takes it modulo 64, so this is only valid when the bit of BitNo with
  // value 32 is known to be zero.
  if (SrcVT == MVT::i64) {
    APInt Bit32(BitNo.getValueSizeInBits(), 32);
    if (DAG.MaskedValueIsZero(BitNo, Bit32)) {
      Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
      SrcVT = Src.getValueType();
    }
  }

  // When the operand types already agree there is nothing left to adjust.
  if (SrcVT == BitNo.getValueType())
    return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);

  // Otherwise extend the bit index to the source type. BT ignores high bits
  // (like shifts), so an any-extend is sufficient.
  if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse()) {
    // Peek through a mask/modulo operation by extending both AND operands.
    // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
    // we probably need a better IsDesirableToPromoteOp to handle this as well.
    SDValue ExtLHS =
        DAG.getNode(ISD::ANY_EXTEND, DL, SrcVT, BitNo.getOperand(0));
    SDValue ExtRHS =
        DAG.getNode(ISD::ANY_EXTEND, DL, SrcVT, BitNo.getOperand(1));
    BitNo = DAG.getNode(ISD::AND, DL, SrcVT, ExtLHS, ExtRHS);
  } else {
    BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, SrcVT, BitNo);
  }

  return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
}

/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS); } /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a /// recognizable memcmp expansion. static bool isOrXorXorTree(SDValue X, bool Root = true) { if (X.getOpcode() == ISD::OR) return isOrXorXorTree(X.getOperand(0), false) && isOrXorXorTree(X.getOperand(1), false); if (Root) return false; return X.getOpcode() == ISD::XOR; } /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp /// expansion. template static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV) { SDValue Op0 = X.getOperand(0); SDValue Op1 = X.getOperand(1); if (X.getOpcode() == ISD::OR) { SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV); SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV); if (VecVT != CmpVT) return DAG.getNode(ISD::OR, DL, CmpVT, A, B); if (HasPT) return DAG.getNode(ISD::OR, DL, VecVT, A, B); return DAG.getNode(ISD::AND, DL, CmpVT, A, B); } if (X.getOpcode() == ISD::XOR) { SDValue A = SToV(Op0); SDValue B = SToV(Op1); if (VecVT != CmpVT) return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE); if (HasPT) return DAG.getNode(ISD::XOR, DL, VecVT, A, B); return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); } llvm_unreachable("Impossible"); } /// Try to map a 128-bit or larger integer comparison to vector instructions /// before type legalization splits it up into chunks. static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); // We're looking for an oversized integer equality comparison. 
EVT OpVT = X.getValueType(); unsigned OpSize = OpVT.getSizeInBits(); if (!OpVT.isScalarInteger() || OpSize < 128) return SDValue(); // Ignore a comparison with zero because that gets special treatment in // EmitTest(). But make an exception for the special case of a pair of // logically-combined vector-sized operands compared to zero. This pattern may // be generated by the memcmp expansion pass with oversized integer compares // (see PR33325). bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X); if (isNullConstant(Y) && !IsOrXorXorTreeCCZero) return SDValue(); // Don't perform this combine if constructing the vector will be expensive. auto IsVectorBitCastCheap = [](SDValue X) { X = peekThroughBitcasts(X); return isa(X) || X.getValueType().isVector() || X.getOpcode() == ISD::LOAD; }; if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && !IsOrXorXorTreeCCZero) return SDValue(); // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX()) || (OpSize == 512 && Subtarget.useAVX512Regs()))) { bool HasPT = Subtarget.hasSSE41(); // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened // vector registers are essentially free. (Technically, widening registers // prevents load folding, but the tradeoff is worth it.) bool PreferKOT = Subtarget.preferMaskRegisters(); bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512; EVT VecVT = MVT::v16i8; EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT; if (OpSize == 256) { VecVT = MVT::v32i8; CmpVT = PreferKOT ? 
MVT::v32i1 : VecVT; } EVT CastVT = VecVT; bool NeedsAVX512FCast = false; if (OpSize == 512 || NeedZExt) { if (Subtarget.hasBWI()) { VecVT = MVT::v64i8; CmpVT = MVT::v64i1; if (OpSize == 512) CastVT = VecVT; } else { VecVT = MVT::v16i32; CmpVT = MVT::v16i1; CastVT = OpSize == 512 ? VecVT : OpSize == 256 ? MVT::v8i32 : MVT::v4i32; NeedsAVX512FCast = true; } } auto ScalarToVector = [&](SDValue X) -> SDValue { bool TmpZext = false; EVT TmpCastVT = CastVT; if (X.getOpcode() == ISD::ZERO_EXTEND) { SDValue OrigX = X.getOperand(0); unsigned OrigSize = OrigX.getScalarValueSizeInBits(); if (OrigSize < OpSize) { if (OrigSize == 128) { TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8; X = OrigX; TmpZext = true; } else if (OrigSize == 256) { TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8; X = OrigX; TmpZext = true; } } } X = DAG.getBitcast(TmpCastVT, X); if (!NeedZExt && !TmpZext) return X; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, DAG.getConstant(0, DL, VecVT), X, DAG.getVectorIdxConstant(0, DL)); }; SDValue Cmp; if (IsOrXorXorTreeCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne // Use 2 vector equality compares and 'and' the results before doing a // MOVMSK. Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector); } else { SDValue VecX = ScalarToVector(X); SDValue VecY = ScalarToVector(Y); if (VecVT != CmpVT) { Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE); } else if (HasPT) { Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY); } else { Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } } // AVX512 should emit a setcc that will lower to kortest. if (VecVT != CmpVT) { EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16; return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), DAG.getConstant(0, DL, KRegVT), CC); } if (HasPT) { SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? 
MVT::v4i64 : MVT::v2i64, Cmp); SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp); X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG); return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0)); } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne assert(Cmp.getValueType() == MVT::v16i8 && "Non 128-bit vector on pre-SSE41 target"); SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32); return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); } return SDValue(); } /// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) /// style scalarized (associative) reduction patterns. Partial reductions /// are supported when the pointer SrcMask is non-null. /// TODO - move this to SelectionDAG? static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl &SrcOps, SmallVectorImpl *SrcMask = nullptr) { SmallVector Opnds; DenseMap SrcOpMap; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. assert(Op.getOpcode() == unsigned(BinOp) && "Unexpected bit reduction opcode"); Opnds.push_back(Op.getOperand(0)); Opnds.push_back(Op.getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; // BFS traverse all BinOp operands. if (I->getOpcode() == unsigned(BinOp)) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. e += 2; // 2 more nodes (LHS and RHS) are pushed. continue; } // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return false; // Quit if without a constant index. 
auto *Idx = dyn_cast(I->getOperand(1)); if (!Idx) return false; SDValue Src = I->getOperand(0); DenseMap::iterator M = SrcOpMap.find(Src); if (M == SrcOpMap.end()) { VT = Src.getValueType(); // Quit if not the same type. if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType()) return false; unsigned NumElts = VT.getVectorNumElements(); APInt EltCount = APInt::getZero(NumElts); M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; SrcOps.push_back(Src); } // Quit if element already used. unsigned CIdx = Idx->getZExtValue(); if (M->second[CIdx]) return false; M->second.setBit(CIdx); } if (SrcMask) { // Collect the source partial masks. for (SDValue &SrcOp : SrcOps) SrcMask->push_back(SrcOpMap[SrcOp]); } else { // Quit if not all elements are used. for (const auto &I : SrcOpMap) if (!I.second.isAllOnes()) return false; } return true; } // Helper function for comparing all bits of two vectors. static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC) { EVT VT = LHS.getValueType(); unsigned ScalarSize = VT.getScalarSizeInBits(); if (OriginalMask.getBitWidth() != ScalarSize) { assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch"); return SDValue(); } // Quit if not convertable to legal scalar or 128/256-bit vector. if (!llvm::has_single_bit(VT.getSizeInBits())) return SDValue(); // FCMP may use ISD::SETNE when nnan - early out if we manage to get here. if (VT.isFloatingPoint()) return SDValue(); assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); X86CC = (CC == ISD::SETEQ ? 
X86::COND_E : X86::COND_NE); APInt Mask = OriginalMask; auto MaskBits = [&](SDValue Src) { if (Mask.isAllOnes()) return Src; EVT SrcVT = Src.getValueType(); SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT); return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue); }; // For sub-128-bit vector, cast to (legal) integer and compare with zero. if (VT.getSizeInBits() < 128) { EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) { if (IntVT != MVT::i64) return SDValue(); auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL, MVT::i32, MVT::i32); auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL, MVT::i32, MVT::i32); SDValue Lo = DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first); SDValue Hi = DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second); return DAG.getNode(X86ISD::CMP, DL, MVT::i32, DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi), DAG.getConstant(0, DL, MVT::i32)); } return DAG.getNode(X86ISD::CMP, DL, MVT::i32, DAG.getBitcast(IntVT, MaskBits(LHS)), DAG.getBitcast(IntVT, MaskBits(RHS))); } // Without PTEST, a masked v2i64 or-reduction is not faster than // scalarization. bool UseKORTEST = Subtarget.useAVX512Regs(); bool UsePTEST = Subtarget.hasSSE41(); if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32) return SDValue(); // Split down to 128/256/512-bit vector. unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128); // If the input vector has vector elements wider than the target test size, // then cast to so it will safely split. 
if (ScalarSize > TestSize) { if (!Mask.isAllOnes()) return SDValue(); VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64); LHS = DAG.getBitcast(VT, LHS); RHS = DAG.getBitcast(VT, RHS); Mask = APInt::getAllOnes(64); } if (VT.getSizeInBits() > TestSize) { KnownBits KnownRHS = DAG.computeKnownBits(RHS); if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) { // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits. while (VT.getSizeInBits() > TestSize) { auto Split = DAG.SplitVector(LHS, DL); VT = Split.first.getValueType(); LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second); } RHS = DAG.getAllOnesConstant(DL, VT); } else if (!UsePTEST && !KnownRHS.isZero()) { // MOVMSK Special Case: // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....) MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8; VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits()); LHS = DAG.getBitcast(VT, MaskBits(LHS)); RHS = DAG.getBitcast(VT, MaskBits(RHS)); EVT BoolVT = VT.changeVectorElementType(MVT::i1); SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ); V = DAG.getSExtOrTrunc(V, DL, VT); while (VT.getSizeInBits() > TestSize) { auto Split = DAG.SplitVector(V, DL); VT = Split.first.getValueType(); V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second); } V = DAG.getNOT(DL, V, VT); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V, DAG.getConstant(0, DL, MVT::i32)); } else { // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern. 
SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); while (VT.getSizeInBits() > TestSize) { auto Split = DAG.SplitVector(V, DL); VT = Split.first.getValueType(); V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); } LHS = V; RHS = DAG.getConstant(0, DL, VT); } } if (UseKORTEST && VT.is512BitVector()) { MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); MVT BoolVT = TestVT.changeVectorElementType(MVT::i1); LHS = DAG.getBitcast(TestVT, MaskBits(LHS)); RHS = DAG.getBitcast(TestVT, MaskBits(RHS)); SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE); return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V); } if (UsePTEST) { MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); LHS = DAG.getBitcast(TestVT, MaskBits(LHS)); RHS = DAG.getBitcast(TestVT, MaskBits(RHS)); SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V); } assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits"); MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8; LHS = DAG.getBitcast(MaskVT, MaskBits(LHS)); RHS = DAG.getBitcast(MaskVT, MaskBits(RHS)); SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS); V = DAG.getNOT(DL, V, MaskVT); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V, DAG.getConstant(0, DL, MVT::i32)); } // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback // to CMP(MOVMSK(PCMPEQB(X,Y))). 
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); bool CmpNull = isNullConstant(RHS); bool CmpAllOnes = isAllOnesConstant(RHS); if (!CmpNull && !CmpAllOnes) return SDValue(); SDValue Op = LHS; if (!Subtarget.hasSSE2() || !Op->hasOneUse()) return SDValue(); // Check whether we're masking/truncating an OR-reduction result, in which // case track the masked bits. // TODO: Add CmpAllOnes support. APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits()); if (CmpNull) { switch (Op.getOpcode()) { case ISD::TRUNCATE: { SDValue Src = Op.getOperand(0); Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(), Op.getScalarValueSizeInBits()); Op = Src; break; } case ISD::AND: { if (auto *Cst = dyn_cast(Op.getOperand(1))) { Mask = Cst->getAPIntValue(); Op = Op.getOperand(0); } break; } } } ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND; // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns. // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns. SmallVector VecIns; if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) { EVT VT = VecIns[0].getValueType(); assert(llvm::all_of(VecIns, [VT](SDValue V) { return VT == V.getValueType(); }) && "Reduction source vector mismatch"); // Quit if not splittable to scalar/128/256/512-bit vector. if (!llvm::has_single_bit(VT.getSizeInBits())) return SDValue(); // If more than one full vector is evaluated, AND/OR them first before // PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will AND/OR 2 nodes and append the result until there is // only 1 node left, i.e. the final value of all vectors. 
SDValue LHS = VecIns[Slot]; SDValue RHS = VecIns[Slot + 1]; VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS)); } return LowerVectorAllEqual(DL, VecIns.back(), CmpNull ? DAG.getConstant(0, DL, VT) : DAG.getAllOnesConstant(DL, VT), CC, Mask, Subtarget, DAG, X86CC); } // Match icmp(reduce_or(X),0) anyof reduction patterns. // Match icmp(reduce_and(X),-1) allof reduction patterns. if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ISD::NodeType BinOp; if (SDValue Match = DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) { EVT MatchVT = Match.getValueType(); return LowerVectorAllEqual(DL, Match, CmpNull ? DAG.getConstant(0, DL, MatchVT) : DAG.getAllOnesConstant(DL, MatchVT), CC, Mask, Subtarget, DAG, X86CC); } } if (Mask.isAllOnes()) { assert(!Op.getValueType().isVector() && "Illegal vector type for reduction pattern"); SDValue Src = peekThroughBitcasts(Op); if (Src.getValueType().isFixedLengthVector() && Src.getValueType().getScalarType() == MVT::i1) { // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns. // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns. if (Src.getOpcode() == ISD::SETCC) { SDValue LHS = Src.getOperand(0); SDValue RHS = Src.getOperand(1); EVT LHSVT = LHS.getValueType(); ISD::CondCode SrcCC = cast(Src.getOperand(2))->get(); if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) && llvm::has_single_bit(LHSVT.getSizeInBits())) { APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits()); return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG, X86CC); } } // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns. // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns. // Peek through truncation, mask the LSB and compare against zero/LSB. if (Src.getOpcode() == ISD::TRUNCATE) { SDValue Inner = Src.getOperand(0); EVT InnerVT = Inner.getValueType(); if (llvm::has_single_bit(InnerVT.getSizeInBits())) { unsigned BW = InnerVT.getScalarSizeInBits(); APInt SrcMask = APInt(BW, 1); APInt Cmp = CmpNull ? 
APInt::getZero(BW) : SrcMask; return LowerVectorAllEqual(DL, Inner, DAG.getConstant(Cmp, DL, InnerVT), CC, SrcMask, Subtarget, DAG, X86CC); } } } } return SDValue(); } /// return true if \c Op has a use that doesn't just read flags. static bool hasNonFlagsUse(SDValue Op) { for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned UOpNo = UI.getOperandNo(); if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { // Look pass truncate. UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } // Transform to an x86-specific ALU node with flags if there is a chance of // using an RMW op or only the flags are used. Otherwise, leave // the node alone and emit a 'cmp' or 'test' instruction. static bool isProfitableToUseFlagOp(SDValue Op) { for (SDNode *U : Op->uses()) if (U->getOpcode() != ISD::CopyToReg && U->getOpcode() != ISD::SETCC && U->getOpcode() != ISD::STORE) return false; return true; } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. 
switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; [[fallthrough]]; default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; SDValue ArithOp = Op; // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. if (!hasNonFlagsUse(Op)) break; [[fallthrough]]; case ISD::ADD: case ISD::SUB: case ISD::OR: case ISD::XOR: if (!isProfitableToUseFlagOp(Op)) break; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { // clang-format off default: llvm_unreachable("unexpected operator!"); case ISD::ADD: Opcode = X86ISD::ADD; break; case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: Opcode = X86ISD::OR; break; // clang-format on } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); case ISD::SSUBO: case ISD::USUBO: { // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag. 
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), Op->getOperand(1)).getValue(1); } default: break; } if (Opcode == 0) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG, Subtarget); EVT CmpVT = Op0.getValueType(); assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); // Only promote the compare up to I32 if it is a 16 bit operation // with an immediate. 16 bit immediates are to be avoided unless the target // isn't slowed down by length changing prefixes, we're optimizing for // codesize or the comparison is with a folded load. if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() && !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) && !DAG.getMachineFunction().getFunction().hasMinSize()) { auto *COp0 = dyn_cast(Op0); auto *COp1 = dyn_cast(Op1); // Don't do this if the immediate can fit in 8-bits. if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { // For equality comparisons try to use SIGN_EXTEND if the input was // truncate from something with enough sign bits. 
if (Op0.getOpcode() == ISD::TRUNCATE) { if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16) ExtendOp = ISD::SIGN_EXTEND; } else if (Op1.getOpcode() == ISD::TRUNCATE) { if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16) ExtendOp = ISD::SIGN_EXTEND; } } CmpVT = MVT::i32; Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0); Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } } // Try to shrink i64 compares if the input has enough zero bits. // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)? if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) && Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) && DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) { CmpVT = MVT::i32; Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); } // 0-x == y --> x+y == 0 // 0-x != y --> x+y != 0 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) && Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1); return Add.getValue(1); } // x == 0-y --> x+y == 0 // x != 0-y --> x+y != 0 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) && Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1)); return Add.getValue(1); } // Use SUB instead of CMP to enable CSE between SUB and CMP. 
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return Sub.getValue(1); } bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const { return !VT.isVector() || Cond != ISD::CondCode::SETEQ; } bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast( SDNode *N, SDValue, SDValue IntPow2) const { if (N->getOpcode() == ISD::FDIV) return true; EVT FPVT = N->getValueType(0); EVT IntVT = IntPow2.getValueType(); // This indicates a non-free bitcast. // TODO: This is probably overly conservative as we will need to scale the // integer vector anyways for the int->fp cast. if (FPVT.isVector() && FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits()) return false; return true; } /// Check if replacement of SQRT with RSQRT should be disabled. bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // We don't need to replace SQRT with RSQRT for half type. if (VT.getScalarType() == MVT::f16) return true; // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) return Subtarget.hasFastVectorFSQRT(); return Subtarget.hasFastScalarFSQRT(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { SDLoc DL(Op); EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). 
If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 // after legalize types. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; UseOneConstNR = false; // There is no FSQRT for 512-bits, but there is RSQRT14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op); if (RefinementSteps == 0 && !Reciprocal) Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate); return Estimate; } if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && Subtarget.hasFP16()) { assert(Reciprocal && "Don't replace SQRT with RSQRT for half type"); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 0; if (VT == MVT::f16) { SDValue Zero = DAG.getIntPtrConstant(0, DL); SDValue Undef = DAG.getUNDEF(MVT::v8f16); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero); } return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op); } return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { SDLoc DL(Op); EVT VT = Op.getValueType(); // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. 
// It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { // Enable estimate codegen with 1 refinement step for vector division. // Scalar division estimates are disabled because they break too much // real-world code. These defaults are intended to match GCC behavior. if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) return SDValue(); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; // There is no FSQRT for 512-bits, but there is RCP14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP; return DAG.getNode(Opcode, DL, VT, Op); } if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && Subtarget.hasFP16()) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 0; if (VT == MVT::f16) { SDValue Zero = DAG.getIntPtrConstant(0, DL); SDValue Undef = DAG.getUNDEF(MVT::v8f16); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero); } return DAG.getNode(X86ISD::RCP14, DL, VT, Op); } return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to /// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. 
/// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } SDValue X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) && "Unexpected divisor!"); // Only perform this transform if CMOV is supported otherwise the select // below will become a branch. if (!Subtarget.canUseCMOV()) return SDValue(); // fold (sdiv X, pow2) EVT VT = N->getValueType(0); // FIXME: Support i8. if (VT != MVT::i16 && VT != MVT::i32 && !(Subtarget.is64Bit() && VT == MVT::i64)) return SDValue(); // If the divisor is 2 or -2, the default expansion is better. if (Divisor == 2 || Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true)) return SDValue(); return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created); } /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC) { assert(And.getOpcode() == ISD::AND && "Expected AND node!"); SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::TRUNCATE) Op1 = Op1.getOperand(0); SDValue Src, BitNo; if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. 
unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { KnownBits Known = DAG.computeKnownBits(Op0); if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) return SDValue(); } Src = Op1; BitNo = Op0.getOperand(1); } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast(Op1); uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { Src = AndLHS.getOperand(0); BitNo = AndLHS.getOperand(1); } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. bool OptForSize = DAG.shouldOptForSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, Src.getValueType()); } } } // No patterns found, give up. if (!Src.getNode()) return SDValue(); // Remove any bit flip. if (isBitwiseNot(Src)) { Src = Src.getOperand(0); CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ; } // Attempt to create the X86ISD::BT node. if (SDValue BT = getBT(Src, BitNo, dl, DAG)) { X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; return BT; } return SDValue(); } // Check if pre-AVX condcode can be performed by a single FCMP op. static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) { return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ); } /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. 
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling) { unsigned SSECC; bool Swap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (SetCCOpcode) { // clang-format off default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: case ISD::SETGT: Swap = true; [[fallthrough]]; case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: case ISD::SETGE: Swap = true; [[fallthrough]]; case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; case ISD::SETULE: Swap = true; [[fallthrough]]; case ISD::SETUGE: SSECC = 5; break; case ISD::SETULT: Swap = true; [[fallthrough]]; case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: SSECC = 8; break; case ISD::SETONE: SSECC = 12; break; // clang-format on } if (Swap) std::swap(Op0, Op1); switch (SetCCOpcode) { default: IsAlwaysSignaling = true; break; case ISD::SETEQ: case ISD::SETOEQ: case ISD::SETUEQ: case ISD::SETNE: case ISD::SETONE: case ISD::SETUNE: case ISD::SETO: case ISD::SETUO: IsAlwaysSignaling = false; break; } return SSECC; } /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then /// concatenate the result back. 
static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
                              ISD::CondCode Cond, SelectionDAG &DAG,
                              const SDLoc &dl) {
  // \p VT must be an integer type shared by both operands (asserted). The
  // result is a CONCAT_VECTORS of two SETCC nodes, one per half.
  assert(VT.isInteger() && VT == LHS.getValueType() &&
         VT == RHS.getValueType() && "Unsupported VTs!");

  SDValue CC = DAG.getCondCode(Cond);

  // Extract the LHS Lo/Hi vectors
  SDValue LHS1, LHS2;
  std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);

  // Extract the RHS Lo/Hi vectors
  SDValue RHS1, RHS2;
  std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back.
  // The two halves are built as separate statements (Lo first) so that the
  // order in which the nodes are created does not depend on the host
  // compiler's argument evaluation order.
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  SDValue Lo = DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC);
  SDValue Hi = DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
}

/// Lower an integer vector SETCC whose result has i1 elements (asserted).
/// The only rewrite performed is canonicalizing SETLT into SETGT by swapping
/// the operands; the SETCC is then re-emitted via getSetCC.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
                                     SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();

  // Prefer SETGT over SETLT.
  if (SetCCOpcode == ISD::SETLT) {
    SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
    std::swap(Op0, Op1);
  }

  return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}

/// Given a buildvector constant, return a new vector constant with each element
/// incremented or decremented. If incrementing or decrementing would result in
/// unsigned overflow or underflow (or signed overflow or underflow when NSW is
/// set) or this is not a simple vector constant, return an empty value.
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
                                    bool NSW) {
  // Bail out unless V is a BUILD_VECTOR of a simple type.
  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
  if (!BV || !V.getValueType().isSimple())
    return SDValue();

  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> NewVecC;
  SDLoc DL(V);
  for (unsigned i = 0; i < NumElts; ++i) {
    // Every element must be a non-opaque constant of exactly the element type.
    auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
      return SDValue();

    // Avoid overflow/underflow.
    const APInt &EltC = Elt->getAPIntValue();
    if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
      return SDValue();
    if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
                (!IsInc && EltC.isMinSignedValue())))
      return SDValue();

    NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
  }

  return DAG.getBuildVector(VT, DL, NewVecC);
}

/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
///   t = psubus Op0, Op1
///   pcmpeq t, <0..0>
///
/// Only i8/i16 element types on SSE2+ are handled, and only for the unsigned
/// conditions ULT/UGT/UGE/ULE; anything else yields an empty SDValue.
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
                                    ISD::CondCode Cond, const SDLoc &dl,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  MVT VET = VT.getVectorElementType();
  if (VET != MVT::i8 && VET != MVT::i16)
    return SDValue();

  switch (Cond) {
  default:
    return SDValue();
  case ISD::SETULT: {
    // If the comparison is against a constant we can turn this into a
    // setule.  With psubus, setule does not require a swap.  This is
    // beneficial because the constant in the register is no longer
    // destructed as the destination so it can be hoisted out of a loop.
    // Only do this pre-AVX since vpcmp* is no longer destructive.
    if (Subtarget.hasAVX())
      return SDValue();
    SDValue ULEOp1 =
        incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
    if (!ULEOp1)
      return SDValue();
    Op1 = ULEOp1;
    break;
  }
  case ISD::SETUGT: {
    // If the comparison is against a constant, we can turn this into a setuge.
    // This is beneficial because materializing a constant 0 for the PCMPEQ is
    // probably cheaper than XOR+PCMPGT using 2 different vector constants:
    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
    SDValue UGEOp1 =
        incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
    if (!UGEOp1)
      return SDValue();
    Op1 = Op0;
    Op0 = UGEOp1;
    break;
  }
  // Psubus is better than flip-sign because it requires no inversion.
  case ISD::SETUGE:
    std::swap(Op0, Op1);
    break;
  case ISD::SETULE:
    break;
  }

  SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                     DAG.getConstant(0, dl, VT));
}

/// Lower a vector SETCC, STRICT_FSETCC or STRICT_FSETCCS node.
///
/// Floating-point compares are handled first and return either the compare
/// value or, for strict nodes, a {value, chain} merge. Integer compares then
/// run through a sequence of special cases before the generic PCMPEQ/PCMPGT
/// lowering at the end. An empty SDValue is returned where this function
/// declines (soft f16, or a strict quiet compare that pre-AVX can only
/// express with a signaling encoding).
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
                  Op.getOpcode() == ISD::STRICT_FSETCCS;
  // Strict nodes carry the chain as operand 0, shifting the others by one.
  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
  SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
  MVT VT = Op->getSimpleValueType(0);
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
    if (isSoftF16(EltVT, Subtarget))
      return SDValue();

    bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();

    // If we have a strict compare with a vXi1 result and the input is 128/256
    // bits we can't use a masked compare unless we have VLX. If we use a wider
    // compare like we do for non-strict, we might trigger spurious exceptions
    // from the upper elements. Instead emit a AVX compare and convert to mask.
    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
        (!IsStrict || Subtarget.hasVLX() ||
         Op0.getSimpleValueType().is512BitVector())) {
#ifndef NDEBUG
      unsigned Num = VT.getVectorNumElements();
      assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
#endif
      Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
    } else {
      Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This allows
      // them to work with an SSE1 target (integer vector types are not legal).
      VT = Op0.getSimpleValueType();
    }

    SDValue Cmp;
    bool IsAlwaysSignaling;
    // Note: this may swap Op0 and Op1.
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
    if (!Subtarget.hasAVX()) {
      // TODO: We could use following steps to handle a quiet compare with
      // signaling encodings.
      // 1. Get ordered masks from a quiet ISD::SETO
      // 2. Use the masks to mask potential unordered elements in operand A, B
      // 3. Get the compare results of masked A, B
      // 4. Calculating final result using the mask and result from 3
      // But currently, we just fall back to scalar operations.
      if (IsStrict && IsAlwaysSignaling && !IsSignaling)
        return SDValue();

      // Insert an extra signaling instruction to raise exception.
      if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
        SDValue SignalCmp = DAG.getNode(
            Opc, dl, {VT, MVT::Other},
            {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
        // FIXME: It seems we need to update the flags of all new strict nodes.
        // Otherwise, mayRaiseFPException in MI will return false due to
        // NoFPExcept = false by default. However, I didn't find it in other
        // patches.
        SignalCmp->setFlags(Op->getFlags());
        Chain = SignalCmp.getValue(1);
      }

      // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
      // emit two comparisons and a logic op to tie them together.
      if (!cheapX86FSETCC_SSE(Cond)) {
        // LLVM predicate is SETUEQ or SETONE.
        unsigned CC0, CC1;
        unsigned CombineOpc;
        if (Cond == ISD::SETUEQ) {
          CC0 = 3; // UNORD
          CC1 = 0; // EQ
          CombineOpc = X86ISD::FOR;
        } else {
          assert(Cond == ISD::SETONE);
          CC0 = 7; // ORD
          CC1 = 4; // NEQ
          CombineOpc = X86ISD::FAND;
        }

        SDValue Cmp0, Cmp1;
        if (IsStrict) {
          Cmp0 = DAG.getNode(
              Opc, dl, {VT, MVT::Other},
              {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
          Cmp1 = DAG.getNode(
              Opc, dl, {VT, MVT::Other},
              {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
          // Both compares hang off the same incoming chain; join their
          // output chains.
          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
                              Cmp1.getValue(1));
        } else {
          Cmp0 = DAG.getNode(
              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
          Cmp1 = DAG.getNode(
              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
        }
        Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
      } else {
        if (IsStrict) {
          Cmp = DAG.getNode(
              Opc, dl, {VT, MVT::Other},
              {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
          Chain = Cmp.getValue(1);
        } else
          Cmp = DAG.getNode(
              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
      }
    } else {
      // Handle all other FP comparisons here.
      if (IsStrict) {
        // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
        SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
        Cmp = DAG.getNode(
            Opc, dl, {VT, MVT::Other},
            {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
        Chain = Cmp.getValue(1);
      } else
        Cmp = DAG.getNode(
            Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
    }

    if (VT.getFixedSizeInBits() >
        Op.getSimpleValueType().getFixedSizeInBits()) {
      // We emitted a compare with an XMM/YMM result. Finish converting to a
      // mask register using a vptestm.
      EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
      Cmp = DAG.getBitcast(CastVT, Cmp);
      Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
                         DAG.getConstant(0, dl, CastVT), ISD::SETNE);
    } else {
      // If this is SSE/AVX CMPP, bitcast the result back to integer to match
      // the result type of SETCC. The bitcast is expected to be optimized
      // away during combining/isel.
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
    }

    if (IsStrict)
      return DAG.getMergeValues({Cmp, Chain}, dl);

    return Cmp;
  }

  assert(!IsStrict && "Strict SETCC only handles FP operands.");

  [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
  assert(VTOp0 == Op1.getSimpleValueType() &&
         "Expected operands with same type!");
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
         "Invalid number of packed elements for source and destination!");

  // The non-AVX512 code below works under the assumption that source and
  // destination types are the same.
  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
         "Value types for source and destination must be the same!");

  // The result is boolean, but operands are int/float
  if (VT.getVectorElementType() == MVT::i1) {
    // In AVX-512 architecture setcc returns mask with i1 elements,
    // But there is no compare instruction for i8 and i16 elements in KNL.
    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
           "Unexpected operand type");
    return LowerIntVSETCC_AVX512(Op, dl, DAG);
  }

  // Lower using XOP integer comparisons.
  if (VT.is128BitVector() && Subtarget.hasXOP()) {
    // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (Cond) {
    // clang-format off
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    // clang-format on
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc =
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getTargetConstant(CmpMode, dl, MVT::i8));
  }

  // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
  // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
  if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
    SDValue BC0 = peekThroughBitcasts(Op0);
    if (BC0.getOpcode() == ISD::AND) {
      APInt UndefElts;
      SmallVector<APInt, 64> EltBits;
      if (getTargetConstantBitsFromNode(
              BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits,
              /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) {
        if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
          Cond = ISD::SETEQ;
          Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
        }
      }
    }
  }

  // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
  if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
      Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
    ConstantSDNode *C1 = isConstOrConstSplat(Op1);
    if (C1 && C1->getAPIntValue().isPowerOf2()) {
      unsigned BitWidth = VT.getScalarSizeInBits();
      // Shift the tested bit into the sign position, then smear it across
      // the whole element with an arithmetic shift right.
      unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;

      SDValue Result = Op0.getOperand(0);
      Result = DAG.getNode(ISD::SHL, dl, VT, Result,
                           DAG.getConstant(ShiftAmt, dl, VT));
      Result = DAG.getNode(ISD::SRA, dl, VT, Result,
                           DAG.getConstant(BitWidth - 1, dl, VT));
      return Result;
    }
  }

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);

  // Break 512-bit integer vector compare into smaller ones.
  // TODO: Try harder to use VPCMPx + VPMOV2x?
  if (VT.is512BitVector())
    return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);

  // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
  // not-of-PCMPEQ:
  // X != INT_MIN --> X >s INT_MIN
  // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
  // +X != 0 --> +X >s 0
  APInt ConstValue;
  if (Cond == ISD::SETNE &&
      ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
    if (ConstValue.isMinSignedValue())
      Cond = ISD::SETGT;
    else if (ConstValue.isMaxSignedValue())
      Cond = ISD::SETLT;
    else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
      Cond = ISD::SETGT;
  }

  // If both operands are known non-negative, then an unsigned compare is the
  // same as a signed compare and there's no need to flip signbits.
  // TODO: We could check for more general simplifications here since we're
  // computing known bits.
  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

  // Special case: Use min/max operations for unsigned compares.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (ISD::isUnsignedIntSetCC(Cond) &&
      (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
      TLI.isOperationLegal(ISD::UMIN, VT)) {
    // If we have a constant operand, increment/decrement it and change the
    // condition to avoid an invert.
    if (Cond == ISD::SETUGT) {
      // X > C --> X >= (C+1) --> X == umax(X, C+1)
      if (SDValue UGTOp1 =
              incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
        Op1 = UGTOp1;
        Cond = ISD::SETUGE;
      }
    }
    if (Cond == ISD::SETULT) {
      // X < C --> X <= (C-1) --> X == umin(X, C-1)
      if (SDValue ULTOp1 =
              incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
        Op1 = ULTOp1;
        Cond = ISD::SETULE;
      }
    }
    bool Invert = false;
    unsigned Opc;
    switch (Cond) {
    // clang-format off
    default: llvm_unreachable("Unexpected condition code");
    case ISD::SETUGT: Invert = true; [[fallthrough]];
    case ISD::SETULE: Opc = ISD::UMIN; break;
    case ISD::SETULT: Invert = true; [[fallthrough]];
    case ISD::SETUGE: Opc = ISD::UMAX; break;
    // clang-format on
    }

    SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

    // If the logical-not of the result is required, perform that now.
    if (Invert)
      Result = DAG.getNOT(dl, Result, VT);

    return Result;
  }

  // Try to use SUBUS and PCMPEQ.
  if (FlipSigns)
    if (SDValue V =
            LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
      return V;

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                            : X86ISD::PCMPGT;
  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
              Cond == ISD::SETGE || Cond == ISD::SETUGE;
  bool Invert = Cond == ISD::SETNE ||
                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
      assert(Subtarget.hasSSE2() && "Don't know how to lower!");

      // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
      // the odd elements over the even elements.
      if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
        Op0 = DAG.getConstant(0, dl, MVT::v4i32);
        Op1 = DAG.getBitcast(MVT::v4i32, Op1);

        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
        static const int MaskHi[] = { 1, 1, 3, 3 };
        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

        return DAG.getBitcast(VT, Result);
      }

      if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
        Op0 = DAG.getBitcast(MVT::v4i32, Op0);
        Op1 = DAG.getConstant(-1, dl, MVT::v4i32);

        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
        static const int MaskHi[] = { 1, 1, 3, 3 };
        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

        return DAG.getBitcast(VT, Result);
      }

      // If the i64 elements are sign-extended enough to be representable as i32
      // then we can compare the lower i32 bits and splat.
      if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
          DAG.ComputeNumSignBits(Op1) > 32) {
        Op0 = DAG.getBitcast(MVT::v4i32, Op0);
        Op1 = DAG.getBitcast(MVT::v4i32, Op1);

        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
        static const int MaskLo[] = {0, 0, 2, 2};
        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);

        return DAG.getBitcast(VT, Result);
      }

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
                                             : 0x0000000080000000ULL,
                                   dl, MVT::v2i64);

      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);

      // Cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
//
// Only SETEQ/SETNE are accepted (asserted). Op0 must be a bitcast from a
// vXi1 type supported by the subtarget and Op1 must be the null or all-ones
// constant; otherwise an empty SDValue is returned. On success X86CC receives
// the condition code to test the returned flags with.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget,
                              SDValue &X86CC) {
  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");

  // Must be a bitcast from vXi1.
  if (Op0.getOpcode() != ISD::BITCAST)
    return SDValue();

  Op0 = Op0.getOperand(0);
  MVT VT = Op0.getSimpleValueType();
  if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
      !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
    return SDValue();

  X86::CondCode X86Cond;
  if (isNullConstant(Op1)) {
    X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
  } else if (isAllOnesConstant(Op1)) {
    // C flag is set for all ones.
    X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
  } else
    return SDValue();

  // If the input is an AND, we can combine its operands into the KTEST.
  bool KTestable = false;
  if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
    KTestable = true;
  if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
    KTestable = true;
  // KTEST is only used for the compare-against-zero form.
  if (!isNullConstant(Op1))
    KTestable = false;
  if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
    SDValue LHS = Op0.getOperand(0);
    SDValue RHS = Op0.getOperand(1);
    X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
    return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
  }

  // If the input is an OR, we can combine its operands into the KORTEST.
  // Otherwise the mask is tested against itself.
  SDValue LHS = Op0;
  SDValue RHS = Op0;
  if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
    LHS = Op0.getOperand(0);
    RHS = Op0.getOperand(1);
  }

  X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
  return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}

/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
                                             ISD::CondCode CC, const SDLoc &dl,
                                             SelectionDAG &DAG,
                                             SDValue &X86CC) const {
  // Equality Combines.
  if (CC == ISD::SETEQ || CC == ISD::SETNE) {
    X86::CondCode X86CondCode;

    // Optimize to BT if possible.
    // Lower (X & (1 << N)) == 0 to BT(X, N).
    // Lower ((X >>u N) & 1) != 0 to BT(X, N).
    // Lower ((X >>s N) & 1) != 0 to BT(X, N).
    if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
      if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
        X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
        return BT;
      }
    }

    // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
    if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
                                               X86CondCode)) {
      X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
      return CmpZ;
    }

    // Try to lower using KORTEST or KTEST.
    if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
      return Test;

    // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms
    // of these.
    if (isOneConstant(Op1) || isNullConstant(Op1)) {
      // If the input is a setcc, then reuse the input setcc or use a new one
      // with the inverted condition.
      if (Op0.getOpcode() == X86ISD::SETCC) {
        bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);

        X86CC = Op0.getOperand(0);
        if (Invert) {
          X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
          X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
          X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
        }

        return Op0.getOperand(1);
      }
    }

    // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
    // overflow.
    if (isMinSignedConstant(Op1)) {
      EVT VT = Op0.getValueType();
      if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
        SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
        X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
        X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
        SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
                                  DAG.getConstant(0, dl, VT), Op0);
        return SDValue(Neg.getNode(), 1);
      }
    }

    // Try to use the carry flag from the add in place of a separate CMP for:
    // (seteq (add X, -1), -1). Similar for setne.
    if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
        Op0.getOperand(1) == Op1) {
      if (isProfitableToUseFlagOp(Op0)) {
        SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);

        SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
                                  Op0.getOperand(1));
        DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
        X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
        X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
        return SDValue(New.getNode(), 1);
      }
    }
  }

  // Generic path: translate the condition (which may rewrite Op0/Op1) and
  // emit a plain compare.
  X86::CondCode CondCode =
      TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");

  SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
  X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
  return EFLAGS;
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
                  Op.getOpcode() == ISD::STRICT_FSETCCS;
  MVT VT = Op->getSimpleValueType(0);

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
  SDLoc dl(Op);
  ISD::CondCode CC =
      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();

  if (isSoftF16(Op0.getValueType(), Subtarget))
    return SDValue();

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets handled by emitFlagsForSetcc.
  if (Op0.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
                        Op.getOpcode() == ISD::STRICT_FSETCCS);

    // If softenSetCCOperands returned a scalar, use it.
    if (!Op1.getNode()) {
      assert(Op0.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      if (IsStrict)
        return DAG.getMergeValues({Op0, Chain}, dl);
      return Op0;
    }
  }

  if (Op0.getSimpleValueType().isInteger()) {
    // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
    // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
    // this may translate to less uops depending on uarch implementation. The
    // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
    // canonicalize to that CondCode.
// NOTE: Only do this if incrementing the constant doesn't increase the bit // encoding size - so it must either already be a i8 or i32 immediate, or it // shrinks down to that. We don't do this for any i64's to avoid additional // constant materializations. // TODO: Can we move this to TranslateX86CC to handle jumps/branches too? if (auto *Op1C = dyn_cast(Op1)) { const APInt &Op1Val = Op1C->getAPIntValue(); if (!Op1Val.isZero()) { // Ensure the constant+1 doesn't overflow. if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) || (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) { APInt Op1ValPlusOne = Op1Val + 1; if (Op1ValPlusOne.isSignedIntN(32) && (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) { Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType()); CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE : ISD::CondCode::SETUGE; } } } } SDValue X86CC; SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } // Handle floating point. X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG); if (CondCode == X86::COND_INVALID) return SDValue(); SDValue EFLAGS; if (IsStrict) { bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; EFLAGS = DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); Chain = EFLAGS.getValue(1); } else { EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1); } SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); return IsStrict ? 
DAG.getMergeValues({Res, Chain}, dl) : Res; } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getAllOnesConstant(DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); return getSETCC(CC, Cmp.getValue(1), DL, DAG); } // This function returns three things: the arithmetic computation itself // (Value), an EFLAGS result (Overflow), and a condition code (Cond). The // flag and the condition code define the case in which the arithmetic // computation overflows. static std::pair getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { assert(Op.getResNo() == 0 && "Unexpected result number!"); SDValue Value, Overflow; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned BaseOp = 0; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; case ISD::UADDO: BaseOp = X86ISD::ADD; Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B; break; case ISD::SSUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; case ISD::USUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_B; break; case ISD::SMULO: BaseOp = X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: BaseOp = X86ISD::UMUL; Cond = X86::COND_O; break; } if (BaseOp) { // Also sets EFLAGS. 
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" // has only one use. SDLoc DL(Op); X86::CondCode Cond; SDValue Value, Overflow; std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!"); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); } /// Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::FCMP) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); MVT VT = Op1.getSimpleValueType(); SDValue CC; if (isSoftF16(VT, Subtarget)) { MVT NVT = VT.changeTypeToInteger(); return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond, 
DAG.getBitcast(NVT, Op1), DAG.getBitcast(NVT, Op2))); } // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); bool IsAlwaysSignaling; unsigned SSECC = translateX86FSETCC(cast(Cond.getOperand(2))->get(), CondOp0, CondOp1, IsAlwaysSignaling); if (Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getTargetConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getTargetConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. // If either operand is a +0.0 constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. if (Subtarget.hasAVX() && !isNullFPConstant(Op1) && !isNullFPConstant(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. MVT VecVT = VT == MVT::f32 ? 
MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } // AVX512 fallback is to lower selects of scalar floats to masked moves. if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (Cond.getOpcode() == ISD::SETCC && !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the // select were also updated (for example, EmitTest has a RAUW). Refresh // the local references to the select operands in case they got stale. 
Op1 = Op.getOperand(1); Op2 = Op.getOperand(2); } } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); SDValue CmpOp0 = Cmp.getOperand(0); unsigned CondCode = Cond.getConstantOperandVal(0); // Special handling for __builtin_ffs(X) - 1 pattern which looks like // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special // handle to keep the CMP with 0. This should be removed by // optimizeCompareInst by using the flags from the BSR/TZCNT used for the // cttz_zero_undef. auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) { return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() && Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2)); }; if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) && ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) || (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) { // Keep Cmp. } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); // 'X - 1' sets the carry flag if X == 0. // '0 - X' sets the carry flag if X != 0. 
// Convert the carry flag to a -1/0 mask with sbb: // select (X != 0), -1, Y --> 0 - X; or (sbb), Y // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y // select (X != 0), Y, -1 --> X - 1; or (sbb), Y // select (X == 0), -1, Y --> X - 1; or (sbb), Y SDValue Sub; if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0); } else { SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType()); Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One); } SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Sub.getValue(1)); return DAG.getNode(ISD::OR, DL, VT, SBB, Y); } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E && CmpOp0.getOpcode() == ISD::AND && isOneConstant(CmpOp0.getOperand(1))) { SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 // ( a , a op b) || ( b , a op b) auto isOrXorPattern = [&]() { if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { Src1 = Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); Src2 = Op1; return true; } return false; }; if (isOrXorPattern()) { SDValue Neg; unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); // we need mask of all zeros or ones with same size of the other // operands. 
if (CmpSz > VT.getSizeInBits()) Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); else if (CmpSz < VT.getSizeInBits()) Neg = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), DAG.getConstant(1, DL, VT)); else Neg = CmpOp0; SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1)) SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) && Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) && ((CondCode == X86::COND_S) || // smin(x, 0) (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0) // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x // // If the comparison is testing for a positive value, we have to invert // the sign bit mask, so only do that transform if the target has a // bitwise 'and not' instruction (the invert is free). // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x unsigned ShCt = VT.getSizeInBits() - 1; SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT); SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt); if (CondCode == X86::COND_G) Shift = DAG.getNOT(DL, Shift, VT); return DAG.getNode(ISD::AND, DL, VT, Shift, Op1); } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack? 
IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Cmp.getOpcode() == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { SDValue Value; X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); AddTest = false; } if (AddTest) { // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { X86::CondCode X86CondCode; if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); Cond = BT; AddTest = false; } } } if (AddTest) { CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { unsigned CondCode = CC->getAsZExtVal(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. 
if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Exclude CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // Or finally, promote i8 cmovs if we have CMOV, // or i16 cmovs if it won't prevent folding a load. // FIXME: we should not limit promotion of i8 case to only when the CMOV is // legal, but EmitLoweredSelect() can not deal with these extensions // being inserted between two CMOV's. (in i16 case too TBN) // https://bugs.llvm.org/show_bug.cgi?id=40974 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) || (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) && !X86::mayFoldLoad(Op2, Subtarget))) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); SDValue Ops[] = { Op2, Op1, CC, Cond }; SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags()); } static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); MVT VTElt = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is i8/i16 and BWI is not supported. 
MVT ExtVT = VT; if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) { // If v16i32 is to be avoided, we'll need to split and concatenate. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG); ExtVT = MVT::getVectorVT(MVT::i32, NumElts); } // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, dl)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue V; MVT WideEltVT = WideVT.getVectorElementType(); if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { V = DAG.getNode(Op.getOpcode(), dl, WideVT, In); } else { SDValue NegOne = DAG.getConstant(-1, dl, WideVT); SDValue Zero = DAG.getConstant(0, dl, WideVT); V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } // Truncate if we had to extend i16/i8 above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(VTElt, NumElts); V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, DAG.getIntPtrConstant(0, dl)); return V; } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc DL(Op); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG); assert(Subtarget.hasAVX() && "Expected AVX support"); return LowerAVXExtend(Op, DL, DAG, Subtarget); } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. 
For zero extend this should only handle inputs of // MVT::v64i8 when BWI is not supported, but AVX512 is. static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits()); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && !(VT.is256BitVector() && Subtarget.hasAVX()) && !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. if (InVT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. int InSize = InSVT.getSizeInBits() * NumElts; In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); InVT = In.getSimpleValueType(); } // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results, // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); if (InVT.getVectorNumElements() != NumElts) return DAG.getNode(Op.getOpcode(), dl, VT, In); // FIXME: Apparently we create inreg operations that could be regular // extends. unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, VT, In); } // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. 
if (Subtarget.hasAVX()) { assert(VT.is256BitVector() && "256-bit vector expected"); MVT HalfVT = VT.getHalfNumVectorElementsVT(); int HalfNumElts = HalfVT.getVectorNumElements(); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector HiMask(NumSrcElts, SM_SentinelUndef); for (int i = 0; i != HalfNumElts; ++i) HiMask[i] = HalfNumElts + i; SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In); SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask); Hi = DAG.getNode(Opc, dl, HalfVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } // We should only get here for sign extend. assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"); assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs"); unsigned InNumElts = InVT.getVectorNumElements(); // If the source elements are already all-signbits, we don't need to extend, // just splat the elements. APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts); if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) { unsigned Scale = InNumElts / NumElts; SmallVector ShuffleMask; for (unsigned I = 0; I != NumElts; ++I) ShuffleMask.append(Scale, I); return DAG.getBitcast(VT, DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask)); } // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; SDValue SignExt = Curr; // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. if (InVT != MVT::v4i32) { MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT; unsigned DestWidth = DestVT.getScalarSizeInBits(); unsigned Scale = DestWidth / InSVT.getSizeInBits(); unsigned DestElts = DestVT.getVectorNumElements(); // Build a shuffle mask that takes each input element and places it in the // MSBs of the new element size. 
SmallVector Mask(InNumElts, SM_SentinelUndef); for (unsigned i = 0; i != DestElts; ++i) Mask[i * Scale + (Scale - 1)] = i; Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask); Curr = DAG.getBitcast(DestVT, Curr); unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, DAG.getTargetConstant(SignExtShift, dl, MVT::i8)); } if (VT == MVT::v2i64) { assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT"); SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT); SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5}); SignExt = DAG.getBitcast(VT, SignExt); } return SignExt; } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::i64) && "Unexpected element type"); assert((InVT.getVectorElementType() == MVT::i8 || InVT.getVectorElementType() == MVT::i16 || InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); if (VT == MVT::v32i16 && !Subtarget.hasBWI()) { assert(InVT == MVT::v32i8 && "Unexpected VT!"); return splitVectorIntUnary(Op, DAG, dl); } if (Subtarget.hasInt256()) return Op; // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the high shuffle mask will be {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original 
VT MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); SmallVector ShufMask(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } /// Change a vector store into a pair of half-size vector stores. static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { SDValue StoredVal = Store->getValue(); assert((StoredVal.getValueType().is256BitVector() || StoredVal.getValueType().is512BitVector()) && "Expecting 256/512-bit op"); // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. Assume the input store is legal (this transform is // only used for targets with AVX). Note: It is possible that we have an // illegal type like v2i128, and so we could allow splitting a volatile store // in that case if that is important. if (!Store->isSimple()) return SDValue(); SDLoc DL(Store); SDValue Value0, Value1; std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL); unsigned HalfOffset = Value0.getValueType().getStoreSize(); SDValue Ptr0 = Store->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL); SDValue Ch0 = DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), Store->getOriginalAlign(), Store->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, Store->getPointerInfo().getWithOffset(HalfOffset), Store->getOriginalAlign(), Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); } /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar /// type. 
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG) { SDValue StoredVal = Store->getValue(); assert(StoreVT.is128BitVector() && StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op"); StoredVal = DAG.getBitcast(StoreVT, StoredVal); // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). if (!Store->isSimple()) return SDValue(); MVT StoreSVT = StoreVT.getScalarType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned ScalarSize = StoreSVT.getStoreSize(); SDLoc DL(Store); SmallVector Stores; for (unsigned i = 0; i != NumElems; ++i) { unsigned Offset = i * ScalarSize; SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), TypeSize::getFixed(Offset), DL); SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, DAG.getIntPtrConstant(i, DL)); SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, Store->getPointerInfo().getWithOffset(Offset), Store->getOriginalAlign(), Store->getMemOperand()->getFlags()); Stores.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); } static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast(Op.getNode()); SDLoc dl(St); SDValue StoredVal = St->getValue(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. if (StoredVal.getValueType().isVector() && StoredVal.getValueType().getVectorElementType() == MVT::i1) { unsigned NumElts = StoredVal.getValueType().getVectorNumElements(); assert(NumElts <= 8 && "Unexpected VT"); assert(!St->isTruncatingStore() && "Expected non-truncating store"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); // We must pad with zeros to ensure we store zeroes to any unused bits. 
StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), StoredVal, DAG.getIntPtrConstant(0, dl)); StoredVal = DAG.getBitcast(MVT::i16, StoredVal); StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); // Make sure we store zeros in the extra bits. if (NumElts < 8) StoredVal = DAG.getZeroExtendInReg( StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } if (St->isTruncatingStore()) return SDValue(); // If this is a 256-bit store of concatenated ops, we are better off splitting // that store into two 128-bit stores. This avoids spurious use of 256-bit ops // and each half can execute independently. Some cores would split the op into // halves anyway, so the concat (vinsertf128) is purely an extra op. MVT StoreVT = StoredVal.getSimpleValueType(); if (StoreVT.is256BitVector() || ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) && !Subtarget.hasBWI())) { if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG)) return splitVectorStore(St, DAG); return SDValue(); } if (StoreVT.is32BitVector()) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(StoreVT.is64BitVector() && "Unexpected VT"); assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == TargetLowering::TypeWidenVector && "Unexpected type action!"); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); if (Subtarget.hasSSE2()) { // Widen the vector, cast to a v2x64 type, extract the single 64-bit element // and store it. MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? 
MVT::i64 : MVT::f64; MVT CastVT = MVT::getVectorVT(StVT, 2); StoredVal = DAG.getBitcast(CastVT, StoredVal); StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } assert(Subtarget.hasSSE1() && "Expected SSE"); SDVTList Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, St->getMemOperand()); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector loads."); assert(RegVT.isInteger() && "We only custom lower integer vector loads."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. if (RegVT.getVectorElementType() == MVT::i1) { assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); // Replace chain users with the new chain. 
assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!"); SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd); Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT, DAG.getBitcast(MVT::v16i1, Val), DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } return SDValue(); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes /// each of which has no other use apart from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); // Bail out when we don't have native compare instructions. 
if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0).getValueType() != MVT::f128 && !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); ISD::CondCode CC = cast(Cond.getOperand(2))->get(); // Special case for // setcc([su]{add,sub,mul}o == 0) // setcc([su]{add,sub,mul}o != 1) if (ISD::isOverflowIntrOpRes(LHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && (isNullConstant(RHS) || isOneConstant(RHS))) { SDValue Value, Overflow; X86::CondCode X86Cond; std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG); if ((CC == ISD::SETEQ) == isNullConstant(RHS)) X86Cond = X86::GetOppositeBranchCondition(X86Cond); SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Overflow); } if (LHS.getSimpleValueType().isInteger()) { SDValue CCVal; SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal); return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS); } if (CC == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. 
if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } } } else if (CC == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit OR instruction with a // separate test. SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } else { X86::CondCode X86Cond = TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG); SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } } if (ISD::isOverflowIntrOpRes(Cond)) { SDValue Value, Overflow; X86::CondCode X86Cond; std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Overflow); } // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); EVT CondVT = Cond.getValueType(); // Add an AND with 1 if we don't already have one. 
if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))) Cond = DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT)); SDValue LHS = Cond; SDValue RHS = DAG.getConstant(0, dl, CondVT); SDValue CCVal; SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal); return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool EmitStackProbeCall = hasStackProbeSymbol(MF); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || SplitStack || EmitStackProbeCall; SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); MaybeAlign Alignment(Op.getConstantOperandVal(2)); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. 
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Align StackAlign = TFI.getStackAlign(); if (hasInlineStackProbe(MF)) { MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); Register Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value } if (Alignment && *Alignment > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. 
const Function &F = MF.getFunction(); for (const auto &A : F.args()) { if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); Register Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size); MF.getInfo()->setHasDynAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); Register SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Alignment) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } Result = SP; } Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget.is64Bit() || Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). 
// reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV)); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL); Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4)); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore( Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) // The Win64 ABI uses char* instead of a structure. 
return DAG.expandVAArg(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented"); if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else { assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ && "Unhandled argument type in LowerVAARG"); ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } if (ArgMode == 2) { // Make sure using fp_offset makes sense. assert(!Subtarget.useSoftFloat() && !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()); } // Insert VAARG node into the DAG // VAARG returns two values: Variable Argument Address, Chain SDValue InstOps[] = {Chain, SrcPtr, DAG.getTargetConstant(ArgSize, dl, MVT::i32), DAG.getTargetConstant(ArgMode, dl, MVT::i8), DAG.getTargetConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( Subtarget.isTarget64BitLP64() ? 
X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Alignment=*/std::nullopt, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy( Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL), Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false, /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } // Helper to get immediate/variable SSE shift opcode from other shift opcodes. static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) { switch (Opc) { case ISD::SHL: case X86ISD::VSHL: case X86ISD::VSHLI: return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI; case ISD::SRL: case X86ISD::VSRL: case X86ISD::VSRLI: return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI; case ISD::SRA: case X86ISD::VSRA: case X86ISD::VSRAI: return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI; } llvm_unreachable("Unknown target vector shift node"); } /// Handle vector element shifts where the shift amount is a constant. /// Takes immediate version of shift as input. 
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {
  MVT ElementType = VT.getVectorElementType();

  // Bitcast the source vector to the output type, this is mainly necessary for
  // vXi8/vXi64 shifts.
  if (VT != SrcOp.getSimpleValueType())
    SrcOp = DAG.getBitcast(VT, SrcOp);

  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width
  // An arithmetic shift is clamped to (width - 1); logical shifts yield zero.
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }

  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
         && "Unknown target vector shift-by-constant node");

  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs.
  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
    unsigned ShiftOpc;
    switch (Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      ShiftOpc = ISD::SHL;
      break;
    case X86ISD::VSRLI:
      ShiftOpc = ISD::SRL;
      break;
    case X86ISD::VSRAI:
      ShiftOpc = ISD::SRA;
      break;
    }

    SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
    if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
      return C;
  }

  return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
}

/// Handle vector element shifts by a splat shift amount
///
/// \p ShAmt is a vector whose element \p ShAmtIdx holds the shift amount.
/// That element is moved to lane 0 and zero-extended to fill the low 64 bits
/// before the non-immediate form of \p Opc is emitted.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                   SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT AmtVT = ShAmt.getSimpleValueType();
  assert(AmtVT.isVector() && "Vector shift type mismatch");
  assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
         "Illegal vector splat index");

  // Move the splat element to the bottom element.
  if (ShAmtIdx != 0) {
    SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
    Mask[0] = ShAmtIdx;
    ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
  }

  // Peek through any zext node if we can get back to a 128-bit source.
  if (AmtVT.getScalarSizeInBits() == 64 &&
      (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
       ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
      ShAmt.getOperand(0).getValueType().isSimple() &&
      ShAmt.getOperand(0).getValueType().is128BitVector()) {
    ShAmt = ShAmt.getOperand(0);
    AmtVT = ShAmt.getSimpleValueType();
  }

  // See if we can mask off the upper elements using the existing source node.
  // The shift uses the entire lower 64-bits of the amount vector, so no need to
  // do this for vXi64 types.
  bool IsMasked = false;
  if (AmtVT.getScalarSizeInBits() < 64) {
    if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
        ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
      // If the shift amount has come from a scalar, then zero-extend the scalar
      // before moving to the vector.
      ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
      ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
      ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
      AmtVT = MVT::v4i32;
      IsMasked = true;
    } else if (ShAmt.getOpcode() == ISD::AND) {
      // See if the shift amount is already masked (e.g. for rotation modulo),
      // then we can zero-extend it by setting all the other mask elements to
      // zero.
      SmallVector<SDValue> MaskElts(
          AmtVT.getVectorNumElements(),
          DAG.getConstant(0, dl, AmtVT.getScalarType()));
      MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
      SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
      if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
                                             {ShAmt.getOperand(1), Mask}))) {
        ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
        IsMasked = true;
      }
    }
  }

  // Extract if the shift amount vector is larger than 128-bits.
  if (AmtVT.getSizeInBits() > 128) {
    ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
    AmtVT = ShAmt.getSimpleValueType();
  }

  // Zero-extend bottom element to v2i64 vector type, either by extension or
  // shuffle masking.
  if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
    if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
                                ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
      ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    } else if (Subtarget.hasSSE41()) {
      ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
                          MVT::v2i64, ShAmt);
    } else {
      // Shift the bottom element up to the top of the register and back down
      // again (whole-register byte shifts) to clear everything above it.
      SDValue ByteShift = DAG.getTargetConstant(
          (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
      ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
      ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
                          ByteShift);
      ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
                          ByteShift);
    }
  }

  // Change opcode to non-immediate version.
  Opc = getTargetVShiftUniformOpcode(Opc, true);

  // The return type has to be a 128-bit type with the same element
  // type as the input type.
  MVT EltVT = VT.getVectorElementType();
  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());

  ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}

/// Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {

  if (isAllOnesConstant(Mask))
    return DAG.getConstant(1, dl, MaskVT);
  if (X86::isZeroNode(Mask))
    return DAG.getConstant(0, dl, MaskVT);

  assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");

  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
    assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
    assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
    // In case 32bit mode, bitcast i64 is illegal, extend/split it.
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
    Lo = DAG.getBitcast(MVT::v32i1, Lo);
    Hi = DAG.getBitcast(MVT::v32i1, Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
  } else {
    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                     Mask.getSimpleValueType().getSizeInBits());
    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
    // are extracted by EXTRACT_SUBVECTOR.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                       DAG.getBitcast(BitcastVT, Mask),
                       DAG.getIntPtrConstant(0, dl));
  }
}

/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  // An all-ones mask keeps every lane of Op.
  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  // An undef pass-through is replaced by a zero vector.
  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}

/// Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (auto *MaskConst = dyn_cast(Mask)) if (MaskConst->getZExtValue() & 0x1) return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(Mask.getValueType() == MVT::i8 && "Unexpect type"); SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1, DAG.getBitcast(MVT::v8i1, Mask), DAG.getIntPtrConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_SAE || Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { if (!Fn->hasPersonalityFn()) report_fatal_error( "querying registration node size for function without personality"); // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See // WinEHStatePass for the full struct definition. switch (classifyEHPersonality(Fn->getPersonalityFn())) { case EHPersonality::MSVC_X86SEH: return 24; case EHPersonality::MSVC_CXX: return 16; default: break; } report_fatal_error( "can only recover FP for 32-bit MSVC EH personality functions"); } /// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize /// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. 
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
                                   SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // It's possible that the parent function no longer has a personality function
  // if the exceptional code was optimized away, in which case we just return
  // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
  // registration, or the .set_setframe offset.
  MCSymbol *OffsetSym = MF.getContext().getOrCreateParentFrameOffsetSymbol(
      GlobalValue::dropLLVMManglingEscape(Fn->getName()));
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
  // prologue to RBP in the parent function.
  const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
  // RegNodeBase = EntryEBP - RegNodeSize
  // ParentFP = RegNodeBase - ParentFrameOffset
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}

SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (auto *C = dyn_cast(Rnd)) return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; return false; }; auto isRoundModeSAE = [](SDValue Rnd) { if (auto *C = dyn_cast(Rnd)) { unsigned RC = C->getZExtValue(); if (RC & X86::STATIC_ROUNDING::NO_EXC) { // Clear the NO_EXC bit and check remaining bits.
RC ^= X86::STATIC_ROUNDING::NO_EXC; // As a convenience we allow no other bits or explicitly // current direction. return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION; } } return false; }; auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) { if (auto *C = dyn_cast(Rnd)) { RC = C->getZExtValue(); if (RC & X86::STATIC_ROUNDING::NO_EXC) { // Clear the NO_EXC bit and check remaining bits. RC ^= X86::STATIC_ROUNDING::NO_EXC; return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT || RC == X86::STATIC_ROUNDING::TO_NEG_INF || RC == X86::STATIC_ROUNDING::TO_POS_INF || RC == X86::STATIC_ROUNDING::TO_ZERO; } } return false; }; SDLoc dl(Op); unsigned IntNo = Op.getConstantOperandVal(0); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); // Propagate flags from original node to transformed node(s). SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags()); if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: { // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(2); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Op.getOperand(1), DAG.getTargetConstant(RC, dl, MVT::i32)); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); } case INTR_TYPE_1OP_SAE: { SDValue Sae = Op.getOperand(2); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1)); } case INTR_TYPE_2OP: { SDValue Src2 = Op.getOperand(2); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(3); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Op.getOperand(1), Src2, DAG.getTargetConstant(RC, dl, MVT::i32)); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Src2); } case INTR_TYPE_2OP_SAE: { SDValue Sae = Op.getOperand(3); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case INTR_TYPE_3OP: case INTR_TYPE_3OP_IMM8: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); if (IntrData->Type == INTR_TYPE_3OP_IMM8 && Src3.getValueType() != MVT::i8) { Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, 
MVT::i8); } // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, DAG.getTargetConstant(RC, dl, MVT::i32)); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src1, Src2, Src3}); } case INTR_TYPE_4OP_IMM8: { assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant); SDValue Src4 = Op.getOperand(4); if (Src4.getValueType() != MVT::i8) { Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Src4); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RC Opcode is specified and // - RC is not "current direction". 
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return getVectorMaskingNode( DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, DAG.getTargetConstant(RC, dl, MVT::i32)), Mask, PassThru, Subtarget, DAG); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return getVectorMaskingNode( DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK_SAE: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Rnd = Op.getOperand(4); unsigned Opc; if (isRoundModeCurDirection(Rnd)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Rnd)) Opc = IntrData->Opc1; else return SDValue(); return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. 
bool HasRounding = IntrWithRoundingModeOpcode != 0; if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) return getScalarMaskingNode( DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, DAG.getTargetConstant(RC, dl, MVT::i32)), Mask, passThru, Subtarget, DAG); if (!isRoundModeCurDirection(Rnd)) return SDValue(); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); unsigned Opc = IntrData->Opc0; if (HasRounding) { SDValue Sae = Op.getOperand(6); if (isRoundModeSAE(Sae)) Opc = IntrWithRoundingModeOpcode; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RND: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Rnd = Op.getOperand(5); SDValue NewOp; unsigned RC = 0; if (isRoundModeCurDirection(Rnd)) NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); else if (isRoundModeSAEToX(Rnd, RC)) NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, DAG.getTargetConstant(RC, dl, MVT::i32)); else return SDValue(); return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Sae = Op.getOperand(5); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { 
SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue NewOp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, DAG.getTargetConstant(RC, dl, MVT::i32)); else if (!isRoundModeCurDirection(Rnd)) return SDValue(); } if (!NewOp) NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned Opc = IntrData->Opc0; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(5); if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_SCALAR_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Sae = Op.getOperand(6); unsigned Opc; if (isRoundModeCurDirection(Sae)) Opc = IntrData->Opc0; else if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else return SDValue(); return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_MASK_SAE: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); unsigned Opc = IntrData->Opc0; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(6); if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), Mask, PassThru, 
Subtarget, DAG); } case BLENDV: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger(); Src3 = DAG.getBitcast(MaskVT, Src3); // Reverse the operands to match VSELECT order. return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1); } case VPERM_2OP : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); // Swap Src1 and Src2 in the node creation return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1); } case CFMA_OP_MASKZ: case CFMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = Src3; if (IntrData->Type == CFMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); // We add rounding mode to the Node when // - RC Opcode is specified and // - RC is not "current direction". SDValue NewOp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); unsigned RC = 0; if (isRoundModeSAEToX(Rnd, RC)) NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3, DAG.getTargetConstant(RC, dl, MVT::i32)); else if (!isRoundModeCurDirection(Rnd)) return SDValue(); } if (!NewOp) NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3); return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); } case IFMA_OP: // NOTE: We need to swizzle the operands to pass the multiply operands // first. return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. 
An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getConstant(0, dl, MVT::v8i1), FPclassMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); SDValue CC = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(5); if (isRoundModeSAE(Sae)) return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC, Mask, Sae); if (!isRoundModeCurDirection(Sae)) return SDValue(); } //default rounding mode return DAG.getNode(IntrData->Opc0, dl, MaskVT, {Op.getOperand(1), Op.getOperand(2), CC, Mask}); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue CC = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Cmp; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(5); if (isRoundModeSAE(Sae)) Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae); else if (!isRoundModeCurDirection(Sae)) return SDValue(); } //default rounding mode if (!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. 
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getConstant(0, dl, MVT::v8i1), CmpMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); // Some conditions require the operands to be swapped. if (CC == ISD::SETLT || CC == ISD::SETLE) std::swap(LHS, RHS); SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); break; } case ISD::SETNE: { // (ZF = 1 or PF = 1) SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); break; } case ISD::SETGT: // (CF = 0 and ZF = 0) case ISD::SETLT: { // Condition opposite to GT. Operands swapped above. SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; } case ISD::SETGE: // CF = 0 case ISD::SETLE: // Condition opposite to GE. Operands swapped above. 
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; default: llvm_unreachable("Unexpected illegal condition!"); } return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned CondVal = Op.getConstantOperandVal(3); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getTargetConstant(CondVal, dl, MVT::i8)); else if (isRoundModeSAE(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae); else return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getConstant(0, dl, MVT::v16i1), FCmp, DAG.getIntPtrConstant(0, dl)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, DAG.getBitcast(MVT::i16, Ins)); } case VSHIFT: { SDValue SrcOp = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); assert(ShAmt.getValueType() == MVT::i32 && "Unexpected VSHIFT amount type"); // Catch shift-by-constant. if (auto *CShAmt = dyn_cast(ShAmt)) return getTargetVShiftByConstNode(IntrData->Opc0, dl, Op.getSimpleValueType(), SrcOp, CShAmt->getZExtValue(), DAG); ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt); return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), SrcOp, ShAmt, 0, Subtarget, DAG); } case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is return Op.getOperand(1); // Avoid false dependency. 
if (PassThru.isUndef()) PassThru = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru, Mask); } case FIXUPIMM: case FIXUPIMM_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Passthru = (IntrData->Type == FIXUPIMM) ? Src1 : getZeroVector(VT, Subtarget, DAG, dl); unsigned Opc = IntrData->Opc0; if (IntrData->Opc1 != 0) { SDValue Sae = Op.getOperand(6); if (isRoundModeSAE(Sae)) Opc = IntrData->Opc1; else if (!isRoundModeCurDirection(Sae)) return SDValue(); } SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm); if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE) return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. uint64_t Round = Op.getConstantOperandVal(2); SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } case ROUNDS: { assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. 
uint64_t Round = Op.getConstantOperandVal(3); SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } case BEXTRI: { assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode"); uint64_t Imm = Op.getConstantOperandVal(2); SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl, Op.getValueType()); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Control); } // ADC/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32); SDValue Res; // If the carry in is zero, then we should just use ADD/SUB instead of // ADC/SBB. if (isNullConstant(Op.getOperand(1))) { Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2), Op.getOperand(3)); } else { SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1), DAG.getConstant(-1, dl, MVT::i8)); Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2), Op.getOperand(3), GenCF.getValue(1)); } SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); SDValue Results[] = { SetCC, Res }; return DAG.getMergeValues(Results, dl); } case CVTPD2PS_MASK: case CVTPD2DQ_MASK: case CVTQQ2PS_MASK: case TRUNCATE_TO_REG: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); if (isAllOnesConstant(Mask)) return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), {Src, PassThru, Mask}); } case CVTPS2PH_MASK: { SDValue Src = Op.getOperand(1); SDValue Rnd = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned RC = 0; unsigned Opc = IntrData->Opc0; bool SAE = 
Src.getValueType().is512BitVector() && (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd)); if (SAE) { Opc = X86ISD::CVTPS2PH_SAE; Rnd = DAG.getTargetConstant(RC, dl, MVT::i32); } if (isAllOnesConstant(Mask)) return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd); if (SAE) Opc = X86ISD::MCVTPS2PH_SAE; else Opc = IntrData->Opc1; MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask); } case CVTNEPS2BF16_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); if (ISD::isBuildVectorAllOnes(Mask.getNode())) return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); // Break false dependency. if (PassThru.isUndef()) PassThru = DAG.getConstant(0, dl, PassThru.getValueType()); return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, Mask); } default: break; } } switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. 
case Intrinsic::x86_avx512_ktestc_b: case Intrinsic::x86_avx512_ktestc_w: case Intrinsic::x86_avx512_ktestc_d: case Intrinsic::x86_avx512_ktestc_q: case Intrinsic::x86_avx512_ktestz_b: case Intrinsic::x86_avx512_ktestz_w: case Intrinsic::x86_avx512_ktestz_d: case Intrinsic::x86_avx512_ktestz_q: case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestz_256: case Intrinsic::x86_avx_ptestc_256: case Intrinsic::x86_avx_ptestnzc_256: case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { unsigned TestOpc = X86ISD::PTEST; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx512_ktestc_b: case Intrinsic::x86_avx512_ktestc_w: case Intrinsic::x86_avx512_ktestc_d: case Intrinsic::x86_avx512_ktestc_q: // CF = 1 TestOpc = X86ISD::KTEST; X86CC = X86::COND_B; break; case Intrinsic::x86_avx512_ktestz_b: case Intrinsic::x86_avx512_ktestz_w: case Intrinsic::x86_avx512_ktestz_d: case Intrinsic::x86_avx512_ktestz_q: TestOpc = X86ISD::KTEST; X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: TestOpc = X86ISD::TESTP; [[fallthrough]]; case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: TestOpc = X86ISD::TESTP; 
[[fallthrough]]; case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: TestOpc = X86ISD::TESTP; [[fallthrough]]; case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; } SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: case Intrinsic::x86_sse42_pcmpestric128: case Intrinsic::x86_sse42_pcmpistrio128: case Intrinsic::x86_sse42_pcmpestrio128: case Intrinsic::x86_sse42_pcmpistris128: case Intrinsic::x86_sse42_pcmpestris128: case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_E; break; } SmallVector NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) Opcode = X86ISD::PCMPISTR; else Opcode = X86ISD::PCMPESTR; SmallVector NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_sse42_pcmpistrm128: case Intrinsic::x86_sse42_pcmpestrm128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistrm128) Opcode = X86ISD::PCMPISTR; else Opcode = X86ISD::PCMPESTR; SmallVector NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); } case 
Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); auto &Context = MF.getContext(); MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + Twine(MF.getFunctionNumber())); return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT, DAG.getMCSymbol(S, PtrVT)); } case Intrinsic::x86_seh_lsda: { // Compute the symbol for the LSDA. We know it'll get emitted later. MachineFunction &MF = DAG.getMachineFunction(); SDValue Op1 = Op.getOperand(1); auto *Fn = cast(cast(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. SDValue Result = DAG.getMCSymbol(LSDASym, VT); return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); } case Intrinsic::eh_recoverfp: { SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.eh.recoverfp must take a function as the first argument"); return recoverFramePointer(DAG, Fn, IncomingFPOp); } case Intrinsic::localaddress: { // Returns one of the stack, base, or frame pointer registers, depending on // which is used to reference local variables. MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); else { // Handles the SP or FP case. 
bool CantUseFP = RegInfo->hasStackRealignment(MF); if (CantUseFP) Reg = RegInfo->getPtrSizedStackRegister(MF); else Reg = RegInfo->getPtrSizedFrameRegister(MF); } return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } case Intrinsic::x86_avx512_vp2intersect_q_512: case Intrinsic::x86_avx512_vp2intersect_q_256: case Intrinsic::x86_avx512_vp2intersect_q_128: case Intrinsic::x86_avx512_vp2intersect_d_512: case Intrinsic::x86_avx512_vp2intersect_d_256: case Intrinsic::x86_avx512_vp2intersect_d_128: { MVT MaskVT = Op.getSimpleValueType(); SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); SDLoc DL(Op); SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, Op->getOperand(1), Op->getOperand(2)); SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation); SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation); return DAG.getMergeValues({Result0, Result1}, DL); } case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: case Intrinsic::x86_mmx_psrli_w: case Intrinsic::x86_mmx_psrli_d: case Intrinsic::x86_mmx_psrli_q: case Intrinsic::x86_mmx_psrai_w: case Intrinsic::x86_mmx_psrai_d: { SDLoc DL(Op); SDValue ShAmt = Op.getOperand(2); // If the argument is a constant, convert it to a target constant. if (auto *C = dyn_cast(ShAmt)) { // Clamp out of bounds shift amounts since they will otherwise be masked // to 8-bits which may make it no longer out of bounds. unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255); if (ShiftAmount == 0) return Op.getOperand(1); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), Op.getOperand(0), Op.getOperand(1), DAG.getTargetConstant(ShiftAmount, DL, MVT::i32)); } unsigned NewIntrinsic; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
case Intrinsic::x86_mmx_pslli_w: NewIntrinsic = Intrinsic::x86_mmx_psll_w; break; case Intrinsic::x86_mmx_pslli_d: NewIntrinsic = Intrinsic::x86_mmx_psll_d; break; case Intrinsic::x86_mmx_pslli_q: NewIntrinsic = Intrinsic::x86_mmx_psll_q; break; case Intrinsic::x86_mmx_psrli_w: NewIntrinsic = Intrinsic::x86_mmx_psrl_w; break; case Intrinsic::x86_mmx_psrli_d: NewIntrinsic = Intrinsic::x86_mmx_psrl_d; break; case Intrinsic::x86_mmx_psrli_q: NewIntrinsic = Intrinsic::x86_mmx_psrl_q; break; case Intrinsic::x86_mmx_psrai_w: NewIntrinsic = Intrinsic::x86_mmx_psra_w; break; case Intrinsic::x86_mmx_psrai_d: NewIntrinsic = Intrinsic::x86_mmx_psra_d; break; } // The vector shift intrinsics with scalars uses 32b shift amounts but // the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an // MMX register. ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), DAG.getTargetConstant(NewIntrinsic, DL, getPointerTy(DAG.getDataLayout())), Op.getOperand(1), ShAmt); } case Intrinsic::thread_pointer: { if (Subtarget.isTargetELF()) { SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(PointerType::get( *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS)); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); } report_fatal_error( "Target OS doesn't support __builtin_thread_pointer() yet."); } } } static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. 
if (!C) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, TLI.getPointerTy(DAG.getDataLayout())); EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); // Cast mask to an integer type. Mask = DAG.getBitcast(MaskVT, Mask); MemIntrinsicSDNode *MemIntr = cast(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; SDValue Res = DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); return DAG.getMergeValues({Res, Res.getValue(1)}, dl); } static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), VT.getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); // We support two versions of the gather intrinsics. One with scalar mask and // one with vXi1 mask. Convert scalar to vXi1 if necessary. if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. 
// TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); MemIntrinsicSDNode *MemIntr = cast(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; SDValue Res = DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); return DAG.getMergeValues({Res, Res.getValue(1)}, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); // We support two versions of the scatter intrinsics. One with scalar mask and // one with vXi1 mask. Convert scalar to vXi1 if necessary. if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); MemIntrinsicSDNode *MemIntr = cast(Op); SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; SDValue Res = DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); return Res; } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. 
// getPrefetchNode, continued: bail out when the scale is not a constant.
  if (!C)
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
                                        TLI.getPointerTy(DAG.getDataLayout()));
  // No displacement and no segment override.
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // One i1 mask lane per index element.
  MVT MaskVT =
    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}

/// Handles the lowering of builtin intrinsics with chain that return their
/// value into registers EDX:EAX.
/// If operand SrcReg is a valid register identifier, then operand 2 of N is
/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
/// TargetOpcode.
/// On return, two values have been appended to Results: the 64-bit result
/// followed by the output chain.
/// Returns a Glue value which can be used to add extra copy-from-reg if the
/// expanded intrinsics implicitly defines extra registers (i.e. not just
/// EDX:EAX).
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
                                           SelectionDAG &DAG,
                                           unsigned TargetOpcode,
                                           unsigned SrcReg,
                                           const X86Subtarget &Subtarget,
                                           SmallVectorImpl &Results) {
  // NOTE(review): SmallVectorImpl above and ArrayRef below carry no element
  // type as written (template arguments lost?) -- confirm against upstream.
  SDValue Chain = N->getOperand(0);
  SDValue Glue;

  if (SrcReg) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
    Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
    Glue = Chain.getValue(1);
  }

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue N1Ops[] = {Chain, Glue};
  // Pass the glue operand only when a copy-to-reg was emitted above.
  SDNode *N1 = DAG.getMachineNode(
      TargetOpcode, DL, Tys, ArrayRef(N1Ops, Glue.getNode() ? 2 : 1));
  Chain = SDValue(N1, 0);

  // Reads the content of XCR and returns it in registers EDX:EAX.
// expandIntrinsicWChainHelper, continued: copy the two result halves out of
// the A and D registers, glued to the machine node.
  SDValue LO, HI;
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);
  Glue = HI.getValue(2);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return Glue;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
  return Glue;
}

/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
/// Results receives {counter value, chain}; when Opcode is X86::RDTSCP it
/// receives {counter value, ECX value, chain} instead.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl &Results) {
  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
                                             /* NoRegister */0, Subtarget,
                                             Results);
  if (Opcode != X86::RDTSCP)
    return;

  // Results[1] is the chain pushed by the helper.
  SDValue Chain = Results[1];
  // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
  // the ECX register. Add 'ecx' explicitly to the chain.
// getReadTimeStampCounter, continued (RDTSCP only): the ECX copy replaces the
// chain slot in Results and its own chain is appended as the new last entry.
  SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
  Results[1] = ecx;
  Results.push_back(ecx.getValue(1));
}

/// Custom-lower a READCYCLECOUNTER node by emitting X86::RDTSC through
/// getReadTimeStampCounter and merging the values it produced.
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  // NOTE(review): SmallVector has no element type/size as written (template
  // arguments lost?) -- confirm against upstream.
  SmallVector Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}

/// Record the frame index passed as operand 2 of \p Op in
/// WinEHFuncInfo::EHRegNodeFrameIndex.
///
/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
/// does not pass the dyn_cast below. No DAG nodes are created.
///
/// \returns the incoming chain (operand 0) unchanged.
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  // NOTE(review): dyn_cast target type is missing as written (template
  // argument lost?) -- confirm against upstream.
  auto *FINode = dyn_cast(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// Record the frame index passed as operand 2 of \p Op in
/// WinEHFuncInfo::EHGuardFrameIndex.
///
/// Reports a fatal error if the function has no WinEHFuncInfo or if operand 2
/// does not pass the dyn_cast below. No DAG nodes are created.
///
/// \returns the incoming chain (operand 0) unchanged.
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  // The node produces only a chain. Its operands are {chain, value, pointer,
  // undef of pointer type}.
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  // SignedSat selects the signed-saturating opcode over the unsigned one.
  unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
  return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
}

/// Emit Masked Truncating Store with signed or unsigned saturation.
/// Operands of the new node are {chain, value, pointer, mask}; \p SignedSat
/// selects X86ISD::VMTRUNCSTORES over X86ISD::VMTRUNCSTOREUS.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
                                     const SDLoc &DL,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Val, Ptr, Mask };
  unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
  return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
}

/// Report whether the extended Swift async frame setup is available: never on
/// non-64-bit subtargets, and on 64-bit subtargets only when the target does
/// not use Windows CFI.
bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
                                             const MachineFunction &MF) {
  if (!Subtarget.is64Bit())
    return false;
  // 64-bit targets support extended Swift async frame setup,
  // except for targets that use the windows 64 prologue.
// X86::isExtendedSwiftAsyncFrameSupported, continued.
  return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
}

/// Custom-lower an INTRINSIC_W_CHAIN node.
///
/// Operand 0 of \p Op is the chain and operand 1 the intrinsic ID. Intrinsics
/// for which getIntrinsicWithChain returns no table entry are handled by the
/// first switch (an empty SDValue is returned for any ID not listed there);
/// all others are dispatched on IntrData->Type.
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = Op.getConstantOperandVal(1);
  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {

    case Intrinsic::swift_async_context_addr: {
      SDLoc dl(Op);
      auto &MF = DAG.getMachineFunction();
      // NOTE(review): getInfo() has no type argument as written (template
      // argument lost?) -- confirm against upstream.
      auto *X86FI = MF.getInfo();
      if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
        MF.getFrameInfo().setFrameAddressIsTaken(true);
        X86FI->setHasSwiftAsyncContext(true);
        SDValue Chain = Op->getOperand(0);
        // Result is RBP - 8, computed with a SUB64ri32 machine node.
        SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
        SDValue Result =
            SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
                                       DAG.getTargetConstant(8, dl, MVT::i32)),
                    0);
        // Return { result, chain }.
        return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                           CopyRBP.getValue(1));
      } else {
        // No special extended frame, create or reuse an existing stack slot.
        int PtrSize = Subtarget.is64Bit() ? 8 : 4;
        if (!X86FI->getSwiftAsyncContextFrameIdx())
          X86FI->setSwiftAsyncContextFrameIdx(
              MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
                                                  false));
        SDValue Result =
            DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
                              PtrSize == 8 ? MVT::i64 : MVT::i32);
        // Return { result, chain }.
        return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                           Op->getOperand(0));
      }
    }

    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_rdpkru: {
      SDLoc dl(Op);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      // Create a RDPKRU node and pass 0 to the ECX parameter.
      return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
                         DAG.getConstant(0, dl, MVT::i32));
    }
    case llvm::Intrinsic::x86_wrpkru: {
      SDLoc dl(Op);
      // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
      // to the EDX and ECX parameters.
// LowerINTRINSIC_W_CHAIN, continued (x86_wrpkru case and the remaining
// intrinsics that have no IntrinsicData table entry).
      return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
                         Op.getOperand(0), Op.getOperand(2),
                         DAG.getConstant(0, dl, MVT::i32),
                         DAG.getConstant(0, dl, MVT::i32));
    }
    case llvm::Intrinsic::asan_check_memaccess: {
      // Mark this as adjustsStack because it will be lowered to a call.
      DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
      // Don't do anything here, we will expand these intrinsics out later.
      return Op;
    }
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during FinalizeISel in EmitInstrWithCustomInserter.
      return Op;
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64:
    case Intrinsic::x86_umwait:
    case Intrinsic::x86_tpause: {
      // Three-operand nodes whose i32 result is turned into a COND_B setcc.
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      unsigned Opcode;

      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_umwait:
        Opcode = X86ISD::UMWAIT;
        break;
      case Intrinsic::x86_tpause:
        Opcode = X86ISD::TPAUSE;
        break;
      case Intrinsic::x86_lwpins32:
      case Intrinsic::x86_lwpins64:
        Opcode = X86ISD::LWPINS;
        break;
      }

      SDValue Operation =
          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
      // Return { setcc, chain }.
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                         Operation.getValue(1));
    }
    case Intrinsic::x86_enqcmd:
    case Intrinsic::x86_enqcmds: {
      // Two-operand nodes whose i32 result is turned into a COND_E setcc.
      SDLoc dl(Op);
      SDValue Chain = Op.getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      unsigned Opcode;
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic!");
      case Intrinsic::x86_enqcmd:
        Opcode = X86ISD::ENQCMD;
        break;
      case Intrinsic::x86_enqcmds:
        Opcode = X86ISD::ENQCMDS;
        break;
      }
      SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
                                      Op.getOperand(3));
      SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                         Operation.getValue(1));
    }
    case Intrinsic::x86_aesenc128kl:
    case Intrinsic::x86_aesdec128kl:
    case Intrinsic::x86_aesenc256kl:
    case Intrinsic::x86_aesdec256kl: {
      SDLoc DL(Op);
      // Node results: {v2i64 data, i32 value fed to the setcc, chain}.
      SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
      SDValue Chain = Op.getOperand(0);
      unsigned Opcode;

      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_aesenc128kl:
        Opcode = X86ISD::AESENC128KL;
        break;
      case Intrinsic::x86_aesdec128kl:
        Opcode = X86ISD::AESDEC128KL;
        break;
      case Intrinsic::x86_aesenc256kl:
        Opcode = X86ISD::AESENC256KL;
        break;
      case Intrinsic::x86_aesdec256kl:
        Opcode = X86ISD::AESDEC256KL;
        break;
      }

      // NOTE(review): cast target type is missing as written here and in the
      // other cast(Op) uses below (template argument lost?).
      MemIntrinsicSDNode *MemIntr = cast(Op);
      MachineMemOperand *MMO = MemIntr->getMemOperand();
      EVT MemVT = MemIntr->getMemoryVT();
      SDValue Operation = DAG.getMemIntrinsicNode(
          Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
          MMO);
      SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);

      // Merged result order is {setcc, data, chain}, which differs from the
      // node's own result order.
      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
                         {ZF, Operation.getValue(0), Operation.getValue(2)});
    }
    case Intrinsic::x86_aesencwide128kl:
    case Intrinsic::x86_aesdecwide128kl:
    case Intrinsic::x86_aesencwide256kl:
    case Intrinsic::x86_aesdecwide256kl: {
      SDLoc DL(Op);
      // Node results: {i32 value fed to the setcc, eight v2i64 values, chain}.
      SDVTList VTs = DAG.getVTList(
          {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
           MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
      SDValue Chain = Op.getOperand(0);
      unsigned Opcode;

      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_aesencwide128kl:
        Opcode = X86ISD::AESENCWIDE128KL;
        break;
      case Intrinsic::x86_aesdecwide128kl:
        Opcode = X86ISD::AESDECWIDE128KL;
        break;
      case Intrinsic::x86_aesencwide256kl:
        Opcode = X86ISD::AESENCWIDE256KL;
        break;
      case Intrinsic::x86_aesdecwide256kl:
        Opcode = X86ISD::AESDECWIDE256KL;
        break;
      }

      MemIntrinsicSDNode *MemIntr = cast(Op);
      MachineMemOperand *MMO = MemIntr->getMemOperand();
      EVT MemVT = MemIntr->getMemoryVT();
      // Operands 2..10 of the intrinsic are forwarded unchanged.
      SDValue Operation = DAG.getMemIntrinsicNode(
          Opcode, DL, VTs,
          {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
           Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
           Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
          MemVT, MMO);
      SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);

      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
                         {ZF, Operation.getValue(1), Operation.getValue(2),
                          Operation.getValue(3), Operation.getValue(4),
                          Operation.getValue(5), Operation.getValue(6),
                          Operation.getValue(7), Operation.getValue(8),
                          Operation.getValue(9)});
    }
    case Intrinsic::x86_testui: {
      SDLoc dl(Op);
      SDValue Chain = Op.getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
      SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                         Operation.getValue(1));
    }
    case Intrinsic::x86_atomic_bts_rm:
    case Intrinsic::x86_atomic_btc_rm:
    case Intrinsic::x86_atomic_btr_rm: {
      SDLoc DL(Op);
      MVT VT = Op.getSimpleValueType();
      SDValue Chain = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(2);
      SDValue Op2 = Op.getOperand(3);
      unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm   ? X86ISD::LBTS_RM
                     : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
                                                             : X86ISD::LBTR_RM;
      MachineMemOperand *MMO = cast(Op)->getMemOperand();
      SDValue Res =
          DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
                                  {Chain, Op1, Op2}, VT, MMO);
      Chain = Res.getValue(1);
      // The COND_B setcc is zero-extended or truncated to the result type.
      Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
    }
    case Intrinsic::x86_atomic_bts:
    case Intrinsic::x86_atomic_btc:
    case Intrinsic::x86_atomic_btr: {
      SDLoc DL(Op);
      MVT VT = Op.getSimpleValueType();
      SDValue Chain = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(2);
      SDValue Op2 = Op.getOperand(3);
      unsigned Opc = IntNo == Intrinsic::x86_atomic_bts   ? X86ISD::LBTS
                     : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
                                                          : X86ISD::LBTR;
      // The scalar width of the result type is passed as an extra operand.
      SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
      MachineMemOperand *MMO = cast(Op)->getMemOperand();
      SDValue Res =
          DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
                                  {Chain, Op1, Op2, Size}, VT, MMO);
      Chain = Res.getValue(1);
      Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
      // Shift the 0/1 result left by the immediate taken from Op2; nothing is
      // emitted when that immediate is zero.
      unsigned Imm = Op2->getAsZExtVal();
      if (Imm)
        Res = DAG.getNode(ISD::SHL, DL, VT, Res,
                          DAG.getShiftAmountConstant(Imm, VT, DL));
      return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
    }
    case Intrinsic::x86_cmpccxadd32:
    case Intrinsic::x86_cmpccxadd64: {
      // Forwarded to X86ISD::CMPCCXADD with the intrinsic's own VT list.
      SDLoc DL(Op);
      SDValue Chain = Op.getOperand(0);
      SDValue Addr = Op.getOperand(2);
      SDValue Src1 = Op.getOperand(3);
      SDValue Src2 = Op.getOperand(4);
      SDValue CC = Op.getOperand(5);
      MachineMemOperand *MMO = cast(Op)->getMemOperand();
      SDValue Operation = DAG.getMemIntrinsicNode(
          X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
          MVT::i32, MMO);
      return Operation;
    }
    case Intrinsic::x86_aadd32:
    case Intrinsic::x86_aadd64:
    case Intrinsic::x86_aand32:
    case Intrinsic::x86_aand64:
    case Intrinsic::x86_aor32:
    case Intrinsic::x86_aor64:
    case Intrinsic::x86_axor32:
    case Intrinsic::x86_axor64: {
      SDLoc DL(Op);
      SDValue Chain = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(2);
      SDValue Op2 = Op.getOperand(3);
      // The memory VT is the type of the second data operand.
      MVT VT = Op2.getSimpleValueType();
      unsigned Opc = 0;
      switch (IntNo) {
      default:
        llvm_unreachable("Unknown Intrinsic");
      case Intrinsic::x86_aadd32:
      case Intrinsic::x86_aadd64:
        Opc = X86ISD::AADD;
        break;
      case Intrinsic::x86_aand32:
      case Intrinsic::x86_aand64:
        Opc = X86ISD::AAND;
        break;
      case Intrinsic::x86_aor32:
      case Intrinsic::x86_aor64:
        Opc = X86ISD::AOR;
        break;
      case Intrinsic::x86_axor32:
      case Intrinsic::x86_axor64:
        Opc = X86ISD::AXOR;
        break;
      }
      MachineMemOperand *MMO = cast(Op)->getMemOperand();
      return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
                                     {Chain, Op1, Op2}, VT, MMO);
    }
    case Intrinsic::x86_atomic_add_cc:
    case Intrinsic::x86_atomic_sub_cc:
    case Intrinsic::x86_atomic_or_cc:
    case Intrinsic::x86_atomic_and_cc:
    case Intrinsic::x86_atomic_xor_cc: {
      SDLoc DL(Op);
      SDValue Chain = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(2);
      SDValue Op2 = Op.getOperand(3);
      // Operand 4 is a constant holding the X86 condition code to test.
      X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
      MVT VT = Op2.getSimpleValueType();
      unsigned Opc = 0;
      switch (IntNo) {
      default:
        llvm_unreachable("Unknown Intrinsic");
      case Intrinsic::x86_atomic_add_cc:
        Opc = X86ISD::LADD;
        break;
      case Intrinsic::x86_atomic_sub_cc:
        Opc = X86ISD::LSUB;
        break;
      case Intrinsic::x86_atomic_or_cc:
        Opc = X86ISD::LOR;
        break;
      case Intrinsic::x86_atomic_and_cc:
        Opc = X86ISD::LAND;
        break;
      case Intrinsic::x86_atomic_xor_cc:
        Opc = X86ISD::LXOR;
        break;
      }
      MachineMemOperand *MMO = cast(Op)->getMemOperand();
      SDValue LockArith =
          DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
                                  {Chain, Op1, Op2}, VT, MMO);
      Chain = LockArith.getValue(1);
      // Return { setcc(CC) on the node's i32 result, chain }.
      return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
    }
    }
    // Any other intrinsic without a table entry is not handled here.
    return SDValue();
  }

  // Table-driven intrinsics: dispatch on the entry's Type.
  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
// LowerINTRINSIC_W_CHAIN, continued (RDSEED/RDRAND case and further
// table-driven intrinsic types).
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to i32.
    SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                     DAG.getConstant(1, dl, Op->getValueType(1)),
                     DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
                     SDValue(Result.getNode(), 1)};
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER_AVX2: {
    // Operand order: chain, id, src, base, index, mask, scale.
    SDValue Chain = Op.getOperand(0);
    SDValue Src   = Op.getOperand(2);
    SDValue Base  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask  = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    // Operand order: chain, id, src (v1), base, index, mask, scale.
    SDValue Chain = Op.getOperand(0);
    SDValue Src   = Op.getOperand(2);
    SDValue Base  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask  = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
  //scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base  = Op.getOperand(2);
    SDValue Mask  = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src   = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    // Operand 6 is the hint: 2 selects Opc1, 3 selects Opc0.
    const APInt &HintVal = Op.getConstantOperandAPInt(6);
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask  = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base  = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    // NOTE(review): SmallVector has no element type/size as written here and
    // below (template arguments lost?) -- confirm against upstream.
    SmallVector Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC:
  // Read Processor Register.
  case RDPRU:
  // GetExtended Control Register.
  case XGETBV: {
    SmallVector Results;

    // RDPMC uses ECX to select the index of the performance counter to read.
    // RDPRU uses ECX to select the processor register to read.
    // XGETBV uses ECX to select the index of the XCR register to return.
    // The result is stored into registers EDX:EAX.
    expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
                                Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
// LowerINTRINSIC_W_CHAIN, continued (XTEST and truncating-store cases).
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // The COND_NE setcc on the node's result is zero-extended to the result
    // type of the intrinsic.
    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    // Operand order: chain, id, address, data, mask.
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    // NOTE(review): dyn_cast target type is missing as written (template
    // argument lost?) -- confirm against upstream.
    MemIntrinsicSDNode *MemIntr = dyn_cast(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT  = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      // Otherwise emit a truncating masked store with a vXi1 mask that has one
      // lane per memory element.
      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
      SDValue Offset = DAG.getUNDEF(VMask.getValueType());

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
                                MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
                                true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      // Saturating variants: unmasked when the mask is all ones, masked
      // otherwise.
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  }
}

/// Lower a RETURNADDR node.
///
/// Marks the return address as taken. Returns an empty SDValue when
/// verifyReturnAddressArgumentIsConstant reports a problem. For a depth
/// greater than zero the result is a load from (LowerFRAMEADDR(Op) + slot
/// size); for depth zero it is a load from the return-address frame index.
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = Op.getConstantOperandVal(0);
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

/// Lower ADDROFRETURNADDR: mark the return address as taken and return the
/// return-address frame index.
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}

/// Lower a FRAMEADDR node.
///
/// Marks the frame address as taken. On targets that use Windows CFI the
/// result is a (lazily created) fixed frame index and the depth operand is not
/// consulted; otherwise the pointer-sized frame register is read and
/// dereferenced once per level of depth.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  // NOTE(review): getInfo() has no type argument as written (template
  // argument lost?) -- confirm against upstream.
  X86MachineFunctionInfo *FuncInfo = MF.getInfo();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a fixed frame object to serve as the frame address.
// LowerFRAMEADDR, continued: create and cache the fixed object on first use.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = Op.getConstantOperandVal(0);
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  // Each level of depth loads through the current frame address.
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
//
// Maps a register name to its register. Only "esp", "rsp", "ebp", "rbp",
// "r14" and "r15" are recognised; any other name is a fatal error. "ebp" and
// "rbp" are additionally rejected with a fatal error when the function has no
// frame pointer.
Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

  // NOTE(review): StringSwitch has no result type as written (template
  // argument lost?) -- confirm against upstream.
  Register Reg = StringSwitch(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Case("r14", X86::R14)
                     .Case("r15", X86::R15)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      // Debug-only sanity check of the frame register.
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}

/// Lower FRAME_TO_ARGS_OFFSET to the pointer-sized constant 2 * slot size.
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}

/// Return the register holding the exception pointer: (R|E)DX for the CoreCLR
/// personality, (R|E)AX otherwise. The 64-bit register is used when the
/// subtarget reports isTarget64BitLP64().
Register X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

/// Return the register holding the exception selector, or X86::NoRegister for
/// funclet-based personalities.
Register X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
    return X86::NoRegister;
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

/// Fixed catch objects are needed exactly when targeting Win64.
bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}

/// Lower EH_RETURN.
///
/// Stores the handler (operand 2) at frame register + slot size + offset
/// (operand 1), copies that store address into (R|E)CX, and emits an
/// X86ISD::EH_RETURN node that takes the register as its operand.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain     = Op.getOperand(0);
  SDValue Offset    = Op.getOperand(1);
  SDValue Handler   = Op.getOperand(2);
  SDLoc dl      (Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                       dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

/// Lower EH_SJLJ_SETJMP to an X86ISD::EH_SJLJ_SETJMP node producing
/// {i32, chain}. On non-64-bit subtargets the global base register is
/// requested first (see the comment in the body).
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after CGBR pass ran.
// Therefore, ask for the GlobalBaseReg now, so that the pass // inserts the code for us in case we need it. // Otherwise, we will end up in a situation where we will // reference a virtual register that is not defined! if (!Subtarget.is64Bit()) { const X86InstrInfo *TII = Subtarget.getInstrInfo(); (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); } return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Op.getOperand(0)); } static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl (Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (Subtarget.is64Bit()) { SDValue OutChains[6]; // Large code-model. const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix // Load the pointer to the nested function into R11. 
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), Align(2)); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, dl, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 10)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), Align(2)); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, dl, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 20)); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, dl, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 22)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); CallingConv::ID CC = Func->getCallingConv(); unsigned NestReg; switch (CC) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::X86_StdCall: { // Pass 'nest' parameter in ECX. // Must be kept in sync with X86CallingConv.td NestReg = X86::ECX; // Check that ECX wasn't needed by an 'inreg' parameter. 
FunctionType *FTy = Func->getFunctionType(); const AttributeList &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; unsigned Idx = 0; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasParamAttr(Idx, Attribute::InReg)) { const DataLayout &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" " parameters!"); } } break; } case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: case CallingConv::Tail: case CallingConv::SwiftTail: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; break; } SDValue OutChains[4]; SDValue Addr, Disp; Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(10, dl, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), Align(1)); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 5), Align(1)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), Align(1)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const { /* The rounding mode is in bits 11:10 of FPSR, and has the following settings: 00 Round to nearest 01 Round to -inf 10 Round to +inf 11 Round to 0 GET_ROUNDING, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we use a packed lookup table of the four 2-bit values that we can index by FPSP[11:10] 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10] (0x2d >> ((FPSR & 0xc00) >> 9)) & 3 */ MachineFunction &MF = DAG.getMachineFunction(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); SDValue Chain = Op.getOperand(0); SDValue Ops[] = {Chain, StackSlot}; Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI, Align(2), MachineMemOperand::MOStore); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2)); Chain = CWD.getValue(1); // Mask and turn the control bits into a shift for the lookup table. 
/// Lower SET_ROUNDING: write a new rounding mode into the x87 control word
/// and, when SSE1 is available, into MXCSR as well.
///
/// Operands of \p Op: 0 = chain, 1 = new rounding mode (constant or not).
/// Returns the updated chain.
///
/// Both registers are read and written through one 4-byte stack slot: the
/// x87 control word uses the low 16 bits of it, MXCSR uses all 32.
SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc DL(Op);
  SDValue Chain = Op.getNode()->getOperand(0);

  // FP control word may be set only from data in memory. So we need to allocate
  // stack space to save/load FP control word.
  int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
  SDValue StackSlot =
      DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));

  // Store FP control word into memory.
  SDValue Ops[] = {Chain, StackSlot};
  Chain = DAG.getMemIntrinsicNode(
      X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot and clear RM field (bits 11:10).
  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
  Chain = CWD.getValue(1);
  CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
                    DAG.getConstant(0xf3ff, DL, MVT::i16));

  // Calculate new rounding mode.
  SDValue NewRM = Op.getNode()->getOperand(1);
  SDValue RMBits;
  if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
    // Constant mode: translate it to the control-word field value directly.
    uint64_t RM = CVal->getZExtValue();
    int FieldVal;
    switch (static_cast<RoundingMode>(RM)) {
    // clang-format off
    case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
    case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
    case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
    case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
    default:
      llvm_unreachable("rounding mode is not supported by X86 hardware");
    // clang-format on
    }
    RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
  } else {
    // Need to convert argument into bits of control word:
    //    0 Round to 0       -> 11
    //    1 Round to nearest -> 00
    //    2 Round to +inf    -> 10
    //    3 Round to -inf    -> 01
    // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
    // To make the conversion, put all these values into a value 0xc9 and shift
    // it left depending on the rounding mode:
    //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
    //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
    //    ...
    // (0xc9 << (2 * NewRM + 4)) & 0xc00
    SDValue ShiftValue =
        DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                    DAG.getNode(ISD::ADD, DL, MVT::i32,
                                DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
                                            DAG.getConstant(1, DL, MVT::i8)),
                                DAG.getConstant(4, DL, MVT::i32)));
    SDValue Shifted =
        DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
                    ShiftValue);
    RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
                         DAG.getConstant(0xc00, DL, MVT::i16));
  }

  // Update rounding mode bits and store the new FP Control Word into stack.
  CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
  Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));

  // Load FP control word from the slot.
  SDValue OpsLD[] = {Chain, StackSlot};
  MachineMemOperand *MMOL =
      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
  Chain = DAG.getMemIntrinsicNode(
      X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);

  // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
  // same way but in bits 14:13.
  if (Subtarget.hasSSE1()) {
    // Store MXCSR into memory.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
        StackSlot);

    // Load MXCSR from stack slot and clear RM field (bits 14:13).
    SDValue MXCSR = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
    Chain = MXCSR.getValue(1);
    MXCSR = DAG.getNode(ISD::AND, DL, MVT::i32, MXCSR.getValue(0),
                        DAG.getConstant(0xffff9fff, DL, MVT::i32));

    // Shift X87 RM bits from 11:10 to 14:13.
    RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
    RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
                         DAG.getConstant(3, DL, MVT::i8));

    // Update rounding mode bits and store the new MXCSR value into stack.
    MXCSR = DAG.getNode(ISD::OR, DL, MVT::i32, MXCSR, RMBits);
    Chain = DAG.getStore(Chain, DL, MXCSR, StackSlot, MPI, Align(4));

    // Load MXCSR from the slot.
    Chain = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
        StackSlot);
  }

  return Chain;
}
/// Emit the nodes that install a floating-point environment read from \p Ptr.
///
/// When the subtarget has x87, an X86ISD::FLDENVm memory-intrinsic node is
/// chained in using \p MemVT and \p MMO. When it has SSE1, the ldmxcsr
/// intrinsic is chained in afterwards, reading from \p Ptr + X87StateSize.
/// Returns the resulting chain (the incoming \p Chain if neither applies).
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
                                   EVT MemVT, MachineMemOperand *MMO,
                                   SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  // x87 part of the environment, if the unit is present.
  if (Subtarget.hasX87()) {
    SDValue LoadEnvOps[] = {Chain, Ptr};
    Chain = DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL,
                                    DAG.getVTList(MVT::Other), LoadEnvOps,
                                    MemVT, MMO);
  }

  // Nothing more to do without SSE.
  if (!Subtarget.hasSSE1())
    return Chain;

  // MXCSR is located X87StateSize bytes past the start of the environment.
  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  SDValue MXCSROffset = DAG.getConstant(X87StateSize, DL, PtrVT);
  SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, MXCSROffset);

  // Load MXCSR from memory.
  SDValue LdMXCSRId =
      DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32);
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
                     LdMXCSRId, MXCSRAddr);
}
/// Lower a vXi8/vXi16 CTLZ through a 32-bit-element vector CTLZ.
///
/// The input is zero-extended to i32 elements, CTLZ is taken at that width,
/// the result is truncated back and the extra (32 - element bits) leading
/// zeros introduced by the extension are subtracted:
///   sub(trunc(ctlz(zext32(x))), 32 - EltBits)
/// Inputs with more than 16 elements, or with exactly 16 elements when the
/// subtarget cannot extend to 512 bits, are split instead.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // The widened vector is usable when it has fewer than 16 elements, or
  // exactly 16 and 512-bit extension is available. Otherwise split; the Lo
  // and Hi parts will be handled in the next iteration.
  bool FitsWidened =
      NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ());
  if (!FitsWidened)
    return splitVectorIntUnary(Op, DAG, dl);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use native supported vector instruction vplzcntd.
  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue WideCount = DAG.getNode(ISD::CTLZ, dl, NewVT, Widened);
  SDValue NarrowCount = DAG.getNode(ISD::TRUNCATE, dl, VT, WideCount);

  // Remove the leading zeros contributed by the zero extension.
  SDValue ExtraZeros = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
  return DAG.getNode(ISD::SUB, dl, VT, NarrowCount, ExtraZeros);
}
/// Lower a vector CTLZ with a PSHUFB nibble lookup table.
///
/// The input is viewed as a byte vector. Each byte's leading-zero count is
/// built from the table lookups of its high and low nibbles; the per-byte
/// counts are then merged pairwise, doubling the element width each round,
/// until the element width of \p Op's type is reached.
///
/// \p Subtarget is not used by this function.
/// Returns a value of the same type as \p Op.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

  // Repeat the 16-entry table across every byte of the vector.
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Build an (element == 0) mask of type CmpVT. 512-bit vectors compare into
  // a vXi1 mask first and sign-extend it back to CmpVT; narrower vectors
  // compare directly in CmpVT.
  auto GetIsZeroMask = [&](MVT CmpVT, SDValue LHS, SDValue RHS) {
    if (CmpVT.is512BitVector()) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, CmpVT.getVectorNumElements());
      SDValue Mask = DAG.getSetCC(DL, MaskVT, LHS, RHS, ISD::SETEQ);
      return DAG.getNode(ISD::SIGN_EXTEND, DL, CmpVT, Mask);
    }
    return DAG.getSetCC(DL, CmpVT, LHS, RHS, ISD::SETEQ);
  };

  // Begin by bitcasting the input to byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = DAG.getConstant(0, DL, CurrVT);

  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = Op0;
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ = GetIsZeroMask(CurrVT, Hi, Zero);

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge result back from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    HiZ = GetIsZeroMask(CurrVT, DAG.getBitcast(CurrVT, Op0),
                        DAG.getBitcast(CurrVT, Zero));
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, Res, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, Res, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return Res;
}
if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntUnary(Op, DAG, DL); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) return splitVectorIntUnary(Op, DAG, DL); assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); } static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); if (VT.isVector()) return LowerVectorCTLZ(Op, dl, Subtarget, DAG); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); if (Opc == ISD::CTLZ) { // If src is zero (i.e. bsr sets ZF), returns NumBits. SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1)}; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); } // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getScalarSizeInBits(); SDValue N0 = Op.getOperand(0); SDLoc dl(Op); assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); // If src is known never zero we can skip the CMOV. 
if (DAG.isKnownNeverZero(N0)) return Op; // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT), DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1)}; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); if (VT == MVT::i16 || VT == MVT::i32) return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget); if (VT == MVT::v32i16 || VT == MVT::v64i8) return splitVectorIntBinary(Op, DAG, DL); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return splitVectorIntBinary(Op, DAG, DL); } static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDValue X = Op.getOperand(0), Y = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); SDLoc DL(Op); if (VT == MVT::v32i16 || VT == MVT::v64i8 || (VT.is256BitVector() && !Subtarget.hasInt256())) { assert(Op.getSimpleValueType().isInteger() && "Only handle AVX vector integer operation"); return splitVectorIntBinary(Op, DAG, DL); } // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); unsigned BitWidth = VT.getScalarSizeInBits(); if (Opcode == ISD::USUBSAT) { if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) { // Handle a special-case with a bit-hack instead of cmp+select: // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1) // If the target can use VPTERNLOG, DAGToDAG will match this as // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a // "broadcast" constant load. 
ConstantSDNode *C = isConstOrConstSplat(Y, true); if (C && C->getAPIntValue().isSignMask()) { SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT); SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT); SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask); SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt); return DAG.getNode(ISD::AND, DL, VT, Xor, Sra); } } if (!TLI.isOperationLegal(ISD::UMAX, VT)) { // usubsat X, Y --> (X >u Y) ? X - Y : 0 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); // TODO: Move this to DAGCombiner? if (SetCCResultType == VT && DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits()) return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub); return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); } } if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) && (!VT.isVector() || VT == MVT::v2i64)) { APInt MinVal = APInt::getSignedMinValue(BitWidth); APInt MaxVal = APInt::getSignedMaxValue(BitWidth); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue Result = DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL, DAG.getVTList(VT, SetCCResultType), X, Y); SDValue SumDiff = Result.getValue(0); SDValue Overflow = Result.getValue(1); SDValue SatMin = DAG.getConstant(MinVal, DL, VT); SDValue SatMax = DAG.getConstant(MaxVal, DL, VT); SDValue SumNeg = DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT); Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin); return DAG.getSelect(DL, VT, Overflow, Result, SumDiff); } // Use default expansion. return SDValue(); } static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) { // Since X86 does not have CMOV for 8-bit integer, we don't convert // 8-bit integer abs to NEG and CMOV. 
/// Custom lowering for integer min/max nodes.
///
/// 256-bit vectors on subtargets without Int256 support, and the v32i16 /
/// v64i8 types, are split into halves. Every other type returns an empty
/// SDValue so that the default expansion is used.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // For AVX1 cases, split to use legal ops.
  const bool Is256WithoutInt256 = VT.is256BitVector() && !Subtarget.hasInt256();
  const bool Is512ByteOrWord = VT == MVT::v32i16 || VT == MVT::v64i8;

  if (!Is256WithoutInt256 && !Is512ByteOrWord) {
    // Default to expand.
    return SDValue();
  }

  return splitVectorIntBinary(Op, DAG, DL);
}
auto MatchesZero = [](SDValue Op, APInt Zero) { Op = peekThroughBitcasts(Op); if (auto *CstOp = dyn_cast(Op)) return CstOp->getValueAPF().bitcastToAPInt() == Zero; if (auto *CstOp = dyn_cast(Op)) return CstOp->getAPIntValue() == Zero; if (Op->getOpcode() == ISD::BUILD_VECTOR || Op->getOpcode() == ISD::SPLAT_VECTOR) { for (const SDValue &OpVal : Op->op_values()) { if (OpVal.isUndef()) continue; auto *CstOp = dyn_cast(OpVal); if (!CstOp) return false; if (!CstOp->getValueAPF().isZero()) continue; if (CstOp->getValueAPF().bitcastToAPInt() != Zero) return false; } return true; } return false; }; bool IsXNeverNaN = DAG.isKnownNeverNaN(X); bool IsYNeverNaN = DAG.isKnownNeverNaN(Y); bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath || Op->getFlags().hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(X) || DAG.isKnownNeverZeroFloat(Y); SDValue NewX, NewY; if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) || MatchesZero(X, OppositeZero)) { // Operands are already in right order or order does not matter. NewX = X; NewY = Y; } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) { NewX = Y; NewY = X; } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) && (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) { if (IsXNeverNaN) std::swap(X, Y); // VFPCLASSS consumes a vector type. So provide a minimal one corresponded // xmm register. MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits); SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X); // Bits of classes: // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7] // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 
0b11 : 0b101, DL, MVT::i32); SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm); SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getConstant(0, DL, MVT::v8i1), IsNanZero, DAG.getIntPtrConstant(0, DL)); SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins); NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X); NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y); return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags()); } else { SDValue IsXSigned; if (Subtarget.is64Bit() || VT != MVT::f64) { SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X); SDValue ZeroCst = DAG.getConstant(0, DL, IVT); IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT); } else { assert(VT == MVT::f64); SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64, DAG.getConstantFP(0, DL, MVT::v2f64), X, DAG.getIntPtrConstant(0, DL)); SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins); SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX, DAG.getIntPtrConstant(1, DL)); Hi = DAG.getBitcast(MVT::i32, Hi); SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32); EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT); } if (MinMaxOp == X86ISD::FMAX) { NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y); NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X); } else { NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X); NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y); } } bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN); // If we did no ordering operands for signed zero handling and we need // to process NaN and we know that the second operand is not NaN then put // it in first operand and we will not need to post handle NaN after max/min. 
if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY)) std::swap(NewX, NewY); SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags()); if (IgnoreNaN || DAG.isKnownNeverNaN(NewX)) return MinMax; /* SETUO of NewX against itself is true only when NewX is NaN; in that case NewX is returned instead of the FMAX/FMIN result. */ SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO); return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax); } /* LowerABD: lower ISD::ABDS / ISD::ABDU. Wide vectors are split into halves; scalar integers are computed as trunc(abs(sub(ext(lhs), ext(rhs)))) in a wider type when that type is legal; everything else is left to default expansion. */ static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); // For AVX1 cases, split to use legal ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntBinary(Op, DAG, dl); if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs()) return splitVectorIntBinary(Op, DAG, dl); bool IsSigned = Op.getOpcode() == ISD::ABDS; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // TODO: Move to TargetLowering expandABD() once we have ABD promotion. if (VT.isScalarInteger()) { /* Widen to twice the scalar width, but to at least 32 bits. */ unsigned WideBits = std::max(2 * VT.getScalarSizeInBits(), 32u); MVT WideVT = MVT::getIntegerVT(WideBits); if (TLI.isTypeLegal(WideVT)) { // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs)))) // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs)))) unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0)); SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1)); SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS); SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff); return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff); } } // Default to expand. return SDValue(); } /* LowerMUL: custom lowering for vector multiplies. After splitting types that are too wide for the subtarget, it handles vXi8 (widening to vXi16), v4i32 (two PMULUDQ plus shuffles) and v2i64/v4i64/v8i64 (32-bit partial products) element types in turn. */ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // Decompose 256-bit ops into 128-bit ops. 
if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntBinary(Op, DAG, dl); if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) return splitVectorIntBinary(Op, DAG, dl); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Lower v16i8/v32i8/v64i8 mul as any-extension to v8i16/v16i16/v32i16 // vector pairs, multiply and truncate. if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumElts / NumLanes; /* When the whole vector can be extended to a single vXi16 vector with the same element count, do one widened multiply and truncate. */ if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); return DAG.getNode( ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::MUL, dl, ExVT, DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A), DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B))); } MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // For vXi8 mul, try PMADDUBSW to avoid the need for extension. // Don't do this if we only need to unpack one half. 
if (Subtarget.hasSSSE3()) { bool BIsBuildVector = isa(B); bool IsLoLaneAllZeroOrUndef = BIsBuildVector; bool IsHiLaneAllZeroOrUndef = BIsBuildVector; if (BIsBuildVector) { /* Track whether the elements of B that fall in the low half, respectively the high half, of every 128-bit lane are all zero or undef. */ for (auto [Idx, Val] : enumerate(B->ops())) { if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2)) IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val); else IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val); } } if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) { /* Mask is 0x00FF in every 16-bit element. BLo keeps the bytes of B selected by Mask; BHi uses ANDNP with the same mask (presumably the complementary bytes - confirm against the X86ISD::ANDNP definition). */ SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT)); SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B); SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B); SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo); SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi); RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask); RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi, DAG.getTargetConstant(8, dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi)); } } // Extract the lo/hi parts to any extend to i16. // We're going to mask off the low byte of each result element of the // pmullw, so it doesn't matter what's in the high byte of each 16-bit // element. SDValue Undef = DAG.getUNDEF(VT); SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef)); SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef)); SDValue BLo, BHi; if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { // If the RHS is a constant, manually unpackl/unpackh. 
/* For every group of 16 elements, elements 0-7 go to LoOps and elements 8-15 go to HiOps, each any-extended (or truncated) to i16. */ SmallVector LoOps, HiOps; for (unsigned i = 0; i != NumElts; i += 16) { for (unsigned j = 0; j != 8; ++j) { LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, MVT::i16)); HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, MVT::i16)); } } BLo = DAG.getBuildVector(ExVT, dl, LoOps); BHi = DAG.getBuildVector(ExVT, dl, HiOps); } else { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); } // Multiply, mask the lower 8 bits of the lo/hi results and pack. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); return getPack(DAG, Subtarget, dl, VT, RLo, RHi); } // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && "Should not custom lower when pmulld is available!"); // Extract the odd parts. static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); // Multiply the even parts. SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, A), DAG.getBitcast(MVT::v2i64, B)); // Now multiply odd parts. SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Aodds), DAG.getBitcast(MVT::v2i64, Bodds)); Evens = DAG.getBitcast(VT, Evens); Odds = DAG.getBitcast(VT, Odds); // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. 
static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); assert(!Subtarget.hasDQI() && "DQI should use MULLQ"); // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); // // Hi = psllqi(AloBhi + AhiBlo, 32); // return AloBlo + Hi; /* Use known-zero bits to find operands whose low or high 32 bits of every 64-bit element are zero, so the matching partial product can be replaced by the Zero constant. */ KnownBits AKnown = DAG.computeKnownBits(A); KnownBits BKnown = DAG.computeKnownBits(B); APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero); bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero); APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero); bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero); SDValue Zero = DAG.getConstant(0, dl, VT); // Only multiply lo/hi halves that aren't known to be zero. SDValue AloBlo = Zero; if (!ALoIsZero && !BLoIsZero) AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); SDValue AloBhi = Zero; if (!ALoIsZero && !BHiIsZero) { SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); } SDValue AhiBlo = Zero; if (!AHiIsZero && !BLoIsZero) { SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); } SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); } /* LowervXi8MulWithUNPCK: multiply two vXi8 vectors by widening each half of a 128-bit lane to vXi16. Returns the getPack result built with PackHiHalf set; when Low is non-null, the getPack result built without PackHiHalf is additionally stored through it. IsSigned selects the signed (MULHS on left-shifted bytes) or unsigned (MUL on zero-extended bytes) scheme. */ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low = nullptr) { unsigned NumElts = VT.getVectorNumElements(); // For vXi8 we will unpack the low and high half of each 128 bit lane to widen // to a vXi16 type. 
Do the multiplies, shift the results and pack the half // lane results back together. // We'll take different approaches for signed and unsigned. // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes // and use pmullw to calculate the full 16-bit product. // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and // shift them left into the upper byte of each word. This allows us to use // pmulhw to calculate the full 16-bit product. This trick means we don't // need to sign extend the bytes to use pmullw. MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue ALo, AHi; /* Signed: unpack with Zero as the first operand so the byte of A lands in the upper byte of each word. Unsigned: unpack with Zero as the second operand so the byte of A is zero extended. */ if (IsSigned) { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A)); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A)); } else { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero)); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero)); } SDValue BLo, BHi; if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { // If the RHS is a constant, manually unpackl/unpackh and extend. 
/* For every group of 16 constant elements, elements 0-7 feed LoOps and elements 8-15 feed HiOps. */ SmallVector LoOps, HiOps; for (unsigned i = 0; i != NumElts; i += 16) { for (unsigned j = 0; j != 8; ++j) { SDValue LoOp = B.getOperand(i + j); SDValue HiOp = B.getOperand(i + j + 8); if (IsSigned) { /* Signed: move each constant byte into the upper byte of its i16 element (shift left by 8), matching the unpack used for A. */ LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16); HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16); LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp, DAG.getConstant(8, dl, MVT::i16)); HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp, DAG.getConstant(8, dl, MVT::i16)); } else { LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16); HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16); } LoOps.push_back(LoOp); HiOps.push_back(HiOp); } } BLo = DAG.getBuildVector(ExVT, dl, LoOps); BHi = DAG.getBuildVector(ExVT, dl, HiOps); } else if (IsSigned) { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B)); } else { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero)); } // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and // pack back to vXi8. unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL; SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi); if (Low) *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi); return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true); } /* LowerMULH: custom lowering for vector ISD::MULHS / ISD::MULHU. Types that are too wide for the subtarget are split, vXi32 is built from two PMULDQ/PMULUDQ multiplies, and vXi8 is widened to vXi16 (directly, or through LowervXi8MulWithUNPCK). */ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsSigned = Op->getOpcode() == ISD::MULHS; unsigned NumElts = VT.getVectorNumElements(); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Decompose 256-bit ops into 128-bit ops. 
if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntBinary(Op, DAG, dl); if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) return splitVectorIntBinary(Op, DAG, dl); if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) { assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::v16i32 && Subtarget.hasAVX512())); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widened result. // E.g., PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> // // In other words, to have all the results, we need to perform two PMULxD: // 1. one with the even values. // 2. one with the odd values. // To achieve #2, we need to place the odd values at an even position. // // Place the odd value at an even position (basically, shift all values 1 // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}; // => SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts)); // => SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts)); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2); /* The signed multiply (PMULDQ) is only used when SSE4.1 is available; otherwise the unsigned result is fixed up after the shuffle. */ unsigned Opcode = (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, DAG.getBitcast(MulVT, A), DAG.getBitcast(MulVT, B))); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, DAG.getBitcast(MulVT, Odd0), DAG.getBitcast(MulVT, Odd1))); // Shuffle it back into the right order. 
/* ShufMask picks the odd-indexed i32 element of every 64-bit product, alternating between Mul1 (even result positions) and Mul2 (odd result positions). */ SmallVector ShufMask(NumElts); for (int i = 0; i != (int)NumElts; ++i) ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1; SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask); // If we have a signed multiply but no PMULDQ fix up the result of an // unsigned multiply. if (IsSigned && !Subtarget.hasSSE41()) { /* Fixup = ((0 > A) & B) + ((0 > B) & A), subtracted from the unsigned high half. */ SDValue Zero = DAG.getConstant(0, dl, VT); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A); SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup); } return Res; } // Only i8 vectors should need custom lowering after this. assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && "Unsupported vector type"); // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, // logical shift down the upper half and pack back to i8. // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack // and then ashr/lshr the upper bits down to the lower bits before multiply. if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); } return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG); } // Custom lowering for SMULO/UMULO. Returns the merged pair {low product, // overflow flag}; only vXi8 vectors are handled here. static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Scalars defer to LowerXALUO. 
if (!VT.isVector()) return LowerXALUO(Op, DAG); SDLoc dl(Op); bool IsSigned = Op->getOpcode() == ISD::SMULO; SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); EVT OvfVT = Op->getValueType(1); if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) || (VT == MVT::v64i8 && !Subtarget.hasBWI())) { // Extract the LHS Lo/Hi vectors. SDValue LHSLo, LHSHi; std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl); // Extract the RHS Lo/Hi vectors. SDValue RHSLo, RHSHi; std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl); EVT LoOvfVT, HiOvfVT; std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT); SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT); SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT); // Issue the split operations. SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo); SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi); // Join the separate data results and the overflow results. SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1), Hi.getValue(1)); return DAG.getMergeValues({Res, Ovf}, dl); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { unsigned NumElts = VT.getVectorNumElements(); MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); SDValue Ovf; if (IsSigned) { SDValue High, LowSign; if (OvfVT.getVectorElementType() == MVT::i1 && (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) { // Rather than truncating, try to do the compare on vXi16 or vXi32. 
// Shift the high down filling with sign bits. High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG); // Fill all 16 bits with the sign bit from the low. LowSign = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG); LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign, 15, DAG); SetccVT = OvfVT; if (!Subtarget.hasBWI()) { // We can't do a vXi16 compare so sign extend to v16i32. High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High); LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign); } } else { // Otherwise do the compare at vXi8. High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); High = DAG.getNode(ISD::TRUNCATE, dl, VT, High); LowSign = DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT)); } Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE); } else { SDValue High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); if (OvfVT.getVectorElementType() == MVT::i1 && (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) { // Rather than truncating, try to do the compare on vXi16 or vXi32. SetccVT = OvfVT; if (!Subtarget.hasBWI()) { // We can't do a vXi16 compare so zero extend to v16i32. High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High); } } else { // Otherwise do the compare at vXi8. High = DAG.getNode(ISD::TRUNCATE, dl, VT, High); } Ovf = DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE); } Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT); return DAG.getMergeValues({Low, Ovf}, dl); } SDValue Low; SDValue High = LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low); SDValue Ovf; if (IsSigned) { // SMULO overflows if the high bits don't match the sign of the low. SDValue LowSign = DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT)); Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE); } else { // UMULO overflows if the high bits are non-zero. 
Ovf = DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE); } Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT); return DAG.getMergeValues({Low, Ovf}, dl); } /* LowerWin64_i128OP: lower a 128-bit integer SDIV/UDIV/SREM/UREM on Win64. It first tries expandDIVREMByConstant (guarded by an isa check on operand 1); otherwise it emits the matching RTLIB *_I128 libcall, storing every operand to a 16-byte aligned stack temporary and passing its address. The call result is requested as v2i64 and bitcast back to VT. */ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); if (isa(Op->getOperand(1))) { SmallVector Result; /* On success the two i64 halves in Result are combined with BUILD_PAIR. */ if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG)) return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]); } RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { // clang-format off default: llvm_unreachable("Unexpected request for libcall!"); case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; // clang-format on } SDLoc dl(Op); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; /* Spill every 128-bit operand to its own 16-byte aligned stack slot, chain the stores, and pass a pointer to the slot as the call argument. */ for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); int SPFI = cast(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) 
.setChain(InChain) .setLibCallee( getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, std::move(Args)) .setInRegister() .setSExtResult(isSigned) .setZExtResult(!isSigned); /* The call's return type is v2i64; the first element of CallInfo is bitcast back to the 128-bit integer VT. */ std::pair CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); } /* LowerWin64_FP_TO_INT128: lower [STRICT_]FP_TO_SINT / FP_TO_UINT with a 128-bit integer result on Win64 through the RTLIB conversion libcall. The incoming chain (operand 0 for strict opcodes, the entry node otherwise) is written to Chain and then replaced by the libcall's output chain. */ SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG, SDValue &Chain) const { assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); bool IsStrict = Op->isStrictFPOpcode(); SDValue Arg = Op.getOperand(IsStrict ? 1 : 0); EVT ArgVT = Arg.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); RTLIB::Libcall LC; if (Op->getOpcode() == ISD::FP_TO_SINT || Op->getOpcode() == ISD::STRICT_FP_TO_SINT) LC = RTLIB::getFPTOSINT(ArgVT, VT); else LC = RTLIB::getFPTOUINT(ArgVT, VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!"); SDLoc dl(Op); MakeLibCallOptions CallOptions; Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); SDValue Result; // Expect the i128 result to be returned as a v2i64 in xmm0, cast back to the // expected VT (i128). std::tie(Result, Chain) = makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain); Result = DAG.getBitcast(VT, Result); return Result; } /* LowerWin64_INT128_TO_FP: lower [STRICT_]SINT_TO_FP / UINT_TO_FP with a 128-bit integer argument on Win64 through the RTLIB conversion libcall; for strict opcodes the result and the output chain are merged. */ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); bool IsStrict = Op->isStrictFPOpcode(); SDValue Arg = Op.getOperand(IsStrict ? 
1 : 0); EVT ArgVT = Arg.getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); RTLIB::Libcall LC; if (Op->getOpcode() == ISD::SINT_TO_FP || Op->getOpcode() == ISD::STRICT_SINT_TO_FP) LC = RTLIB::getSINTTOFP(ArgVT, VT); else LC = RTLIB::getUINTTOFP(ArgVT, VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!"); SDLoc dl(Op); MakeLibCallOptions CallOptions; SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); // Pass the i128 argument as an indirect argument on the stack. SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); int SPFI = cast(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16)); SDValue Result; std::tie(Result, Chain) = makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain); return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; } // Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate. uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) { assert((Amt < 8) && "Shift/Rotation amount out of range"); switch (Opcode) { case ISD::BITREVERSE: return 0x8040201008040201ULL; case ISD::SHL: return ((0x0102040810204080ULL >> (Amt)) & (0x0101010101010101ULL * (0xFF >> (Amt)))); case ISD::SRL: return ((0x0102040810204080ULL << (Amt)) & (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF))); case ISD::SRA: return (getGFNICtrlImm(ISD::SRL, Amt) | (0x8080808080808080ULL >> (64 - (8 * Amt)))); case ISD::ROTL: return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt); case ISD::ROTR: return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt); } llvm_unreachable("Unsupported GFNI opcode"); } // Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate. 
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt = 0) { assert(VT.getVectorElementType() == MVT::i8 && (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type"); uint64_t Imm = getGFNICtrlImm(Opcode, Amt); SmallVector MaskBits; /* One i8 constant per 8 bits of VT: byte (I % 64) / 8 of Imm, least-significant byte first, so the 8-byte pattern repeats for every 64 bits of the vector. */ for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) { uint64_t Bits = (Imm >> (I % 64)) & 255; MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8)); } return DAG.getBuildVector(VT, DL, MaskBits); } // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget. Opcode must be ISD::SHL, ISD::SRA or ISD::SRL. static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) && "Unexpected shift opcode"); if (!VT.isSimple()) return false; if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) return false; /* Elements narrower than 16 bits are never reported as supported. */ if (VT.getScalarSizeInBits() < 16) return false; /* 512-bit vectors: any opcode is supported when AVX512 registers are in use, with 16-bit elements additionally requiring BWI. */ if (VT.is512BitVector() && Subtarget.useAVX512Regs() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; /* Logical shifts need SSE2 (128-bit) or Int256 (256-bit); arithmetic shifts of v2i64/v4i64 additionally need AVX512. A 512-bit type that reaches this point yields false. */ bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()); bool AShift = LShift && (Subtarget.hasAVX512() || (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } // The shift amount is a variable, but it is the same for all vector lanes. // These instructions are defined together with shift-immediate. 
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { return supportedVectorShiftWithImm(VT, Subtarget, Opcode); } // Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) && "Unexpected shift opcode"); if (!VT.isSimple()) return false; if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) return false; if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) return false; // vXi16 supported only on AVX-512, BWI if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; if (Subtarget.hasAVX512() && (Subtarget.useAVX512Regs() || !VT.is512BitVector())) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; return (Opcode == ISD::SRA) ? AShift : LShift; } static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false); unsigned EltSizeInBits = VT.getScalarSizeInBits(); auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); // ashr(R, 63) === cmp_slt(R, 0) if (ShiftAmt == 63 && Subtarget.hasSSE42()) { assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && "Unsupported PCMPGT op"); return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R); } if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. 
SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt - 32, DAG); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {9, 1, 11, 3, 13, 5, 15, 7}); } else { // SRA upper i32, SRL whole i64 and select lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); Lower = DAG.getBitcast(ExVT, Lower); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {8, 1, 10, 3, 12, 5, 14, 7}); } return DAG.getBitcast(VT, Ex); }; // Optimize shl/srl/sra with constant shift amount. APInt APIntShiftAmt; if (!X86::isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); // If the shift amount is out of range, return undef. if (APIntShiftAmt.uge(EltSizeInBits)) return DAG.getUNDEF(VT); uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl: (shl V, 1) -> (add (freeze V), (freeze V)) if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { // R may be undef at run-time, but (shl R, 1) must be an even number (LSB // must be 0). (add undef, undef) however can be any value. To make this // safe, we must freeze R to ensure that register allocation uses the same // register for an undefined value. This ensures that the result will // still be even and preserves the original semantics. 
R = DAG.getFreeze(R); return DAG.getNode(ISD::ADD, dl, VT, R, R); } return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || (Subtarget.hasInt256() && VT == MVT::v4i64)) && Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); // If we're logical shifting an all-signbits value then we can just perform as // a mask. if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) && DAG.ComputeNumSignBits(R) == EltSizeInBits) { SDValue Mask = DAG.getAllOnesConstant(dl, VT); Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt); return DAG.getNode(ISD::AND, dl, VT, R, Mask); } if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || (Subtarget.hasBWI() && VT == MVT::v64i8)) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Simple i8 add case if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { // R may be undef at run-time, but (shl R, 1) must be an even number (LSB // must be 0). (add undef, undef) however can be any value. To make this // safe, we must freeze R to ensure that register allocation uses the same // register for an undefined value. This ensures that the result will // still be even and preserves the original semantics. R = DAG.getFreeze(R); return DAG.getNode(ISD::ADD, dl, VT, R, R); } // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { SDValue Zeros = DAG.getConstant(0, dl, VT); if (VT.is512BitVector()) { assert(VT == MVT::v64i8 && "Unexpected element type!"); SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT); return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); } return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // XOP can shift v16i8 directly instead of as shift v8i16 + mask. 
if (VT == MVT::v16i8 && Subtarget.hasXOP()) return SDValue(); if (Subtarget.hasGFNI()) { SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt); return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask, DAG.getTargetConstant(0, dl, MVT::i8)); } if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt); return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt); return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } return SDValue(); } static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); int BaseShAmtIdx = -1; if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) { if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx, Subtarget, DAG); // vXi8 shifts - shift as v8i16 + mask result. 
if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) || (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) || VT == MVT::v64i8) && !Subtarget.hasXOP()) { unsigned NumElts = VT.getVectorNumElements(); MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) { unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL); unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false); // Create the mask using vXi16 shifts. For shift-rights we need to move // the upper byte down before splatting the vXi8 mask. SDValue BitMask = DAG.getConstant(-1, dl, ExtVT); BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask, BaseShAmt, BaseShAmtIdx, Subtarget, DAG); if (Opcode != ISD::SHL) BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask, 8, DAG); BitMask = DAG.getBitcast(VT, BitMask); BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask, SmallVector(NumElts, 0)); SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, DAG.getBitcast(ExtVT, R), BaseShAmt, BaseShAmtIdx, Subtarget, DAG); Res = DAG.getBitcast(VT, Res); Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask); if (Opcode == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask) // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW. SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT); SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt, BaseShAmtIdx, Subtarget, DAG); SignMask = DAG.getBitcast(VT, SignMask); Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask); } return Res; } } } return SDValue(); } // Convert a shift/rotate left amount to a multiplication scale factor. 
// Produces a vector Scale such that (x << Amt) == (x * Scale) per element,
// i.e. Scale[i] = 1 << Amt[i].
//
// Returns an empty SDValue when Amt's type is not one of the handled vector
// types for this subtarget, or when Amt is not constant and neither the v4i32
// nor the pre-AVX2 v8i16 expansion applies. For constant amounts, undef or
// out-of-range elements become undef scale elements.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Amt.getSimpleValueType();
  if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
        (Subtarget.hasInt256() && VT == MVT::v16i16) ||
        (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
        (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
        (Subtarget.hasInt256() && VT == MVT::v32i8) ||
        (Subtarget.hasBWI() && VT == MVT::v64i8)))
    return SDValue();

  MVT SVT = VT.getVectorElementType();
  unsigned SVTBits = SVT.getSizeInBits();
  unsigned NumElems = VT.getVectorNumElements();

  APInt UndefElts;
  SmallVector<APInt> EltBits;
  if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
    APInt One(SVTBits, 1);
    SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
    for (unsigned I = 0; I != NumElems; ++I) {
      if (UndefElts[I] || EltBits[I].uge(SVTBits))
        continue;
      uint64_t ShAmt = EltBits[I].getZExtValue();
      Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
    }
    return DAG.getBuildVector(VT, dl, Elts);
  }

  // If the target doesn't support variable shifts, use either FP conversion
  // or integer multiplication to avoid shifting each element individually.
  if (VT == MVT::v4i32) {
    // Place Amt in the float exponent field on top of the bit pattern of 1.0f
    // (0x3f800000), then convert the resulting float back to an integer.
    Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
                      DAG.getConstant(0x3f800000U, dl, VT));
    Amt = DAG.getBitcast(MVT::v4f32, Amt);
    return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
  }

  // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
  if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
    SDValue Z = DAG.getConstant(0, dl, VT);
    SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
    SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
    Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
    Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
    if (Subtarget.hasSSE41())
      return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
    return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
  }

  return SDValue();
}

// Custom lowering for vector ISD::SHL / ISD::SRL / ISD::SRA.
//
// Tries the splat-amount helpers first, returns Op unchanged when the
// subtarget has a native per-element variable shift, and otherwise walks a
// list of type- and subtarget-specific expansions. Wide types that reach the
// end are split in half; anything else yields an empty SDValue.
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

  unsigned Opc = Op.getOpcode();
  unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
  unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

  if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
    return V;

  if (supportedVectorVarShift(VT, Subtarget, Opc))
    return Op;

  // i64 vector arithmetic shift can be emulated with the transform:
  // M = lshr(SIGN_MASK, Amt)
  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
  if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
       (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
      Opc == ISD::SRA) {
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }

  // XOP has 128-bit variable logical/arithmetic shifts.
  // +ve/-ve Amt = shift left/right.
  if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
                             VT == MVT::v8i16 || VT == MVT::v16i8)) {
    if (Opc == ISD::SRL || Opc == ISD::SRA)
      Amt = DAG.getNegative(Amt, dl, VT);
    if (Opc == ISD::SHL || Opc == ISD::SRL)
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
    if (Opc == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
  if (VT == MVT::v2i64 && Opc != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch it.
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }

  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a BLENDing shuffle instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector <X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes in parallel before blending.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
                      (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
    SDValue Amt1, Amt2;
    unsigned NumElts = VT.getVectorNumElements();
    SmallVector<int> ShuffleMask;
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue A = Amt->getOperand(i);
      if (A.isUndef()) {
        ShuffleMask.push_back(SM_SentinelUndef);
        continue;
      }
      if (!Amt1 || Amt1 == A) {
        ShuffleMask.push_back(i);
        Amt1 = A;
        continue;
      }
      if (!Amt2 || Amt2 == A) {
        ShuffleMask.push_back(i + NumElts);
        Amt2 = A;
        continue;
      }
      // A third distinct amount: leave the mask short so the size check
      // below rejects this strategy.
      break;
    }

    // Only perform this blend if we can perform it without loading a mask.
    if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
        (VT != MVT::v16i16 ||
         is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
        (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
         canWidenShuffleElements(ShuffleMask))) {
      auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
      auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
      if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
          Cst2->getAPIntValue().ult(EltSizeInBits)) {
        SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
                                                    Cst1->getZExtValue(), DAG);
        SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
                                                    Cst2->getZExtValue(), DAG);
        return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
      }
    }
  }

  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
  if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
                                                Subtarget.canExtendTo512BW())))
    if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
      return DAG.getNode(ISD::MUL, dl, VT, R, Scale);

  // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
  // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
  if (Opc == ISD::SRL && ConstantAmt &&
      (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
    SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
    SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
    if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
      // Lanes with a zero shift amount take R directly.
      SDValue Zero = DAG.getConstant(0, dl, VT);
      SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
      SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
      return DAG.getSelect(dl, VT, ZAmt, R, Res);
    }
  }

  // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
  // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
  // TODO: Special case handling for shift by 0/1, really we can afford either
  // of these cases in pre-SSE41/XOP/AVX512 but not both.
  if (Opc == ISD::SRA && ConstantAmt &&
      (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
      ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
        !Subtarget.hasAVX512()) ||
       DAG.isKnownNeverZero(Amt))) {
    SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
    SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
    if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
      // Lanes shifted by 0 take R and lanes shifted by 1 take a direct
      // VSRAI-by-1; all other lanes take the MULHS result.
      SDValue Amt0 =
          DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
      SDValue Amt1 =
          DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
      SDValue Sra1 =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
      SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
      Res = DAG.getSelect(dl, VT, Amt0, R, Res);
      return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
    }
  }

  // v4i32 Non Uniform Shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
  if (VT == MVT::v4i32) {
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. On AVX we're better off
      // just zero-extending, but for SSE just duplicating the top 16-bits is
      // cheaper and has the same effect for out of range values.
      if (Subtarget.hasAVX()) {
        SDValue Z = DAG.getConstant(0, dl, VT);
        Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
        Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
        Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
        Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
      } else {
        SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
        SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
                                             {4, 5, 6, 7, -1, -1, -1, -1});
        SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
        SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
        Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
        Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
        Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
        Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
      }
    }

    unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
    SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
    SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
    SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
    SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));

    // Merge the shifted lane results optimally with/without PBLENDW.
    // TODO - ideally shuffle combining would handle this.
    if (Subtarget.hasSSE41()) {
      SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
      SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
      return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
    }
    SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
    SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
  }

  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
  // make the existing SSE solution better.
  // NOTE: We honor preferred vector width before promoting to 512-bits.
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
      (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
      (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
      (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
           "Unexpected vector type");
    MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Opc, dl, ExtVT, R, Amt));
  }

  // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
  // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
  if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
      (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
       (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
      !Subtarget.hasXOP()) {
    int NumElts = VT.getVectorNumElements();
    SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);

    // Extend constant shift amount to vXi16 (it doesn't matter if the type
    // isn't legal).
    MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
    Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
    Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
           "Constant build vector expected");

    if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
      bool IsSigned = Opc == ISD::SRA;
      R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
      R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
      R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
      return DAG.getZExtOrTrunc(R, dl, VT);
    }

    // Split the scale factors into the low and high 8 elements of each
    // 16-element group, matching the operands built by getUnpackl/getUnpackh
    // below.
    SmallVector<SDValue> LoAmt, HiAmt;
    for (int i = 0; i != NumElts; i += 16) {
      for (int j = 0; j != 8; ++j) {
        LoAmt.push_back(Amt.getOperand(i + j));
        HiAmt.push_back(Amt.getOperand(i + j + 8));
      }
    }

    MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
    SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
    SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);

    SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
    SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
    LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
    HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
    LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
    HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
    LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
    HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
  }

  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
                           ISD::SETGT);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we can use PBLENDVB which selects bytes based just
        // on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT,
                              DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = DAG.getConstant(0, dl, SelVT);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
    Amt = DAG.getBitcast(VT, Amt);

    if (Opc == ISD::SHL || Opc == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }

    if (Opc == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of a i16 vector
      // so we can correctly sign extend. We don't care what happens to the
      // lower byte.
      SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
      SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
      MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
      MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero upper
      // byte meaning that we can safely pack with PACKUSWB.
      RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
      RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }

  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
    MVT ExtVT = MVT::v8i32;
    SDValue Z = DAG.getConstant(0, dl, VT);
    // Amounts are zero-extended into the low half of each i32; the data is
    // placed in the high half and moved back down by the shift by 16 below.
    SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
    SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
    SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
    SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
    Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
    Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  if (VT == MVT::v8i16) {
    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we can use PBLENDVB which selects bytes based just on
      // the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
      return DAG.getSelect(dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
          getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
    } else {
      Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into 128-bit shifts.
  if (VT.is256BitVector())
    return splitVectorIntBinary(Op, DAG, dl);

  if (VT == MVT::v32i16 || VT == MVT::v64i8)
    return splitVectorIntBinary(Op, DAG, dl);

  return SDValue();
}

static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) && "Unexpected funnel shift opcode!"); SDLoc DL(Op); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Amt = Op.getOperand(2); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool IsFSHR = Op.getOpcode() == ISD::FSHR; if (VT.isVector()) { APInt APIntShiftAmt; bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt); unsigned NumElts = VT.getVectorNumElements(); if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { if (IsFSHR) std::swap(Op0, Op1); if (IsCstSplat) { uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, {Op0, Op1, Imm}, DAG, Subtarget); } return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, {Op0, Op1, Amt}, DAG, Subtarget); } assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 || VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && "Unexpected funnel shift type!"); // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw. // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))). if (IsCstSplat) { // TODO: Can't use generic expansion as UNDEF amt elements can be // converted to other values when folded to shift amounts, losing the // splat. uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt; uint64_t ShYAmt = IsFSHR ?
ShiftAmt : (EltSizeInBits - ShiftAmt); assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift"); MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2); if (EltSizeInBits == 8 && (Subtarget.hasXOP() || (useVPTERNLOG(Subtarget, VT) && supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) { // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG // bit-select - lower using vXi16 shifts and then perform the bitmask at // the original vector width to handle cases where we split. APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt); APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt); SDValue ShX = DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0), DAG.getShiftAmountConstant(ShXAmt, WideVT, DL)); SDValue ShY = DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1), DAG.getShiftAmountConstant(ShYAmt, WideVT, DL)); ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX), DAG.getConstant(MaskX, DL, VT)); ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY), DAG.getConstant(MaskY, DL, VT)); return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); } SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0, DAG.getShiftAmountConstant(ShXAmt, VT, DL)); SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1, DAG.getShiftAmountConstant(ShYAmt, VT, DL)); return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); } SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode()); // Constant vXi16 funnel shifts can be efficiently handled by default. if (IsCst && EltSizeInBits == 16) return SDValue(); unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL; MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); // Split 256-bit integers on XOP/pre-AVX2 targets. // Split 512-bit integers on non 512-bit BWI targets. 
if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) || !Subtarget.hasAVX2())) || (VT.is512BitVector() && !Subtarget.useBWIRegs() && EltSizeInBits < 32)) { // Pre-mask the amount modulo using the wider vector. Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod); return splitVectorOp(Op, DAG, DL); } // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z)) if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) { int ScalarAmtIdx = -1; if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) { // Uniform vXi16 funnel shifts can be efficiently handled by default. if (EltSizeInBits == 16) return SDValue(); SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, ScalarAmtIdx, Subtarget, DAG); Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, ScalarAmtIdx, Subtarget, DAG); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); } } MVT WideSVT = MVT::getIntegerVT( std::min(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32)); MVT WideVT = MVT::getVectorVT(WideSVT, NumElts); // If per-element shifts are legal, fallback to generic expansion. if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP()) return SDValue(); // Attempt to fold as: // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). 
if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) && supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) { Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0); Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1); AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod); Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0, EltSizeInBits, DAG); SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1); Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod); if (!IsFSHR) Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res, EltSizeInBits, DAG); return DAG.getNode(ISD::TRUNCATE, DL, VT, Res); } // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z) if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) { SDValue Z = DAG.getConstant(0, DL, VT); SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z)); SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z)); SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo); SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); } // Fallback to generic expansion. return SDValue(); } assert( (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. bool OptForSize = DAG.shouldOptForSize(); bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow(); // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). 
if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) && !isa(Amt)) { SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType()); SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType()); Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32); Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32); Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask); SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift); Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1); if (IsFSHR) { Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt); } else { Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt); Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift); } return DAG.getZExtOrTrunc(Res, DL, VT); } if (VT == MVT::i8 || ExpandFunnel) return SDValue(); // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. if (VT == MVT::i16) { Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, DAG.getConstant(15, DL, Amt.getValueType())); unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL); return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt); } return Op; } static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.isVector() && "Custom lowering only for vector rotates!"); SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); int NumElts = VT.getVectorNumElements(); bool IsROTL = Opcode == ISD::ROTL; // Check for constant splat rotation amount. APInt CstSplatValue; bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue); // Check for splat rotate by zero. if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0) return R; // AVX512 implicitly uses modulo rotation amounts. if ((Subtarget.hasVLX() || (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. if (IsCstSplat) { unsigned RotOpc = IsROTL ? 
X86ISD::VROTLI : X86ISD::VROTRI; uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); return DAG.getNode(RotOpc, DL, VT, R, DAG.getTargetConstant(RotAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. return Op; } // AVX512 VBMI2 vXi16 - lower to funnel shifts. if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) { unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR; return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); } SDValue Z = DAG.getConstant(0, DL, VT); if (!IsROTL) { // If the ISD::ROTR amount is constant, we're always better converting to // ISD::ROTL. if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt})) return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt); // XOP targets always prefers ISD::ROTL. if (Subtarget.hasXOP()) return DAG.getNode(ISD::ROTL, DL, VT, R, DAG.getNode(ISD::SUB, DL, VT, Z, Amt)); } // Attempt to use GFNI gf2p8affine to rotate vXi8 by an uniform constant. if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt); return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask, DAG.getTargetConstant(0, DL, MVT::i8)); } // Split 256-bit integers on XOP/pre-AVX2 targets. if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2())) return splitVectorIntBinary(Op, DAG, DL); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { assert(IsROTL && "Only ROTL expected"); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. if (IsCstSplat) { uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, DAG.getTargetConstant(RotAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). 
return Op; } // Rotate by an uniform constant - expand back to shifts. // TODO: Can't use generic expansion as UNDEF amt elements can be converted // to other values when folded to shift amounts, losing the splat. if (IsCstSplat) { uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt); uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt; SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R, DAG.getShiftAmountConstant(ShlAmt, VT, DL)); SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R, DAG.getShiftAmountConstant(SrlAmt, VT, DL)); return DAG.getNode(ISD::OR, DL, VT, Shl, Srl); } // Split 512-bit integers on non 512-bit BWI targets. if (VT.is512BitVector() && !Subtarget.useBWIRegs()) return splitVectorIntBinary(Op, DAG, DL); assert( (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && Subtarget.hasAVX2()) || ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) && "Only vXi32/vXi16/vXi8 vector rotates supported"); MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); // Attempt to fold as unpack(x,x) << zext(splat(y)): // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) { int BaseRotAmtIdx = -1; if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) { if (EltSizeInBits == 16 && Subtarget.hasSSE41()) { unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR; return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); } unsigned ShiftX86Opc = IsROTL ? 
X86ISD::VSHLI : X86ISD::VSRLI; SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt, BaseRotAmtIdx, Subtarget, DAG); Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt, BaseRotAmtIdx, Subtarget, DAG); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); } } bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL; // Attempt to fold as unpack(x,x) << zext(y): // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). // Const vXi16/vXi32 are excluded in favor of MUL-based lowering. if (!(ConstantAmt && EltSizeInBits != 8) && !supportedVectorVarShift(VT, Subtarget, ShiftOpc) && (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) { SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z)); SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z)); SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo); SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi); return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); } // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by // the amount bit. // TODO: We're doing nothing here that we couldn't do for funnel shifts. if (EltSizeInBits == 8) { MVT WideVT = MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts); // Attempt to fold as: // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw. // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))). if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) && supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) { // If we're rotating by constant, just use default promotion. 
if (ConstantAmt) return SDValue(); // See if we can perform this by widening to vXi16 or vXi32. R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R); R = DAG.getNode( ISD::OR, DL, WideVT, R, getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG)); Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod); R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt); if (IsROTL) R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG); return DAG.getNode(ISD::TRUNCATE, DL, VT, R); } // We don't need ModuloAmt here as we just peek at individual bits. auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { // On SSE41 targets we can use PBLENDVB which selects bytes based just // on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); return DAG.getBitcast(SelVT, DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = DAG.getConstant(0, DL, SelVT); SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel); return DAG.getSelect(DL, SelVT, C, V0, V1); }; // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG. if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) { Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); IsROTL = true; } unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL; unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. 
Amt = DAG.getBitcast(ExtVT, Amt); Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT)); Amt = DAG.getBitcast(VT, Amt); // r = VSELECT(r, rot(r, 4), a); SDValue M; M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)), DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT))); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); // r = VSELECT(r, rot(r, 2), a); M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)), DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT))); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); // return VSELECT(r, rot(r, 1), a); M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)), DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT))); return SignBitSelect(VT, Amt, M, R); } bool IsSplatAmt = DAG.isSplatValue(Amt); bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) && supportedVectorVarShift(VT, Subtarget, ISD::SRL); // Fallback for splats + all supported variable shifts. // Fallback for non-constants AVX2 vXi16 as well. if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt); SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR); return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); } // Everything below assumes ISD::ROTL. if (!IsROTL) { Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); IsROTL = true; } // ISD::ROT* uses modulo rotate amounts. 
Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); assert(IsROTL && "Only ROTL supported"); // As with shifts, attempt to convert the rotation amount to a multiplication // factor, fallback to general expansion. SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG); if (!Scale) return SDValue(); // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results. if (EltSizeInBits == 16) { SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale); SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale); return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits // that can then be OR'd with the lower 32-bits. assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected"); static const int OddMask[] = {1, -1, 3, -1}; SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask); SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask); SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, R), DAG.getBitcast(MVT::v2i64, Scale)); SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, R13), DAG.getBitcast(MVT::v2i64, Scale13)); Res02 = DAG.getBitcast(VT, Res02); Res13 = DAG.getBitcast(VT, Res13); return DAG.getNode(ISD::OR, DL, VT, DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}), DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7})); } /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). 
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit(); if (OpWidth == 128) return Subtarget.canUseCMPXCHG16B(); return false; } TargetLoweringBase::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { Type *MemType = SI->getValueOperand()->getType(); if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && !Subtarget.useSoftFloat()) { if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && (Subtarget.hasSSE1() || Subtarget.hasX87())) return AtomicExpansionKind::None; if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() && Subtarget.hasAVX()) return AtomicExpansionKind::None; } return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand : AtomicExpansionKind::None; } // Note: this turns large loads into lock cmpxchg8b/16b. TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { Type *MemType = LI->getType(); if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) && !Subtarget.useSoftFloat()) { // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we // can use movq to do the load. If we have X87 we can load into an 80-bit // X87 register and store it to a stack temporary. if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && (Subtarget.hasSSE1() || Subtarget.hasX87())) return AtomicExpansionKind::None; // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic. if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() && Subtarget.hasAVX()) return AtomicExpansionKind::None; } return needsCmpXchgNb(MemType) ? 
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } enum BitTestKind : unsigned { UndefBit, ConstantBit, NotConstantBit, ShiftBit, NotShiftBit }; static std::pair FindSingleBitChange(Value *V) { using namespace llvm::PatternMatch; BitTestKind BTK = UndefBit; auto *C = dyn_cast(V); if (C) { // Check if V is a power of 2 or NOT power of 2. if (isPowerOf2_64(C->getZExtValue())) BTK = ConstantBit; else if (isPowerOf2_64((~C->getValue()).getZExtValue())) BTK = NotConstantBit; return {V, BTK}; } // Check if V is some power of 2 pattern known to be non-zero auto *I = dyn_cast(V); if (I) { bool Not = false; // Check if we have a NOT Value *PeekI; if (match(I, m_Not(m_Value(PeekI))) || match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) { Not = true; I = dyn_cast(PeekI); // If I is constant, it will fold and we can evaluate later. If its an // argument or something of that nature, we can't analyze. if (I == nullptr) return {nullptr, UndefBit}; } // We can only use 1 << X without more sophisticated analysis. C << X where // C is a power of 2 but not 1 can result in zero which cannot be translated // to bittest. Likewise any C >> X (either arith or logical) can be zero. if (I->getOpcode() == Instruction::Shl) { // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X & // -X` and some other provable power of 2 patterns that we can use CTZ on // may be profitable. // Todo(2): It may be possible in some cases to prove that Shl(C, X) is // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also // be provably a non-zero power of 2. // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be // transformable to bittest. auto *ShiftVal = dyn_cast(I->getOperand(0)); if (!ShiftVal) return {nullptr, UndefBit}; if (ShiftVal->equalsInt(1)) BTK = Not ? 
NotShiftBit : ShiftBit; if (BTK == UndefBit) return {nullptr, UndefBit}; Value *BitV = I->getOperand(1); Value *AndOp; const APInt *AndC; if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) { // Read past a shiftmask instruction to find count if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1)) BitV = AndOp; } return {BitV, BTK}; } } return {nullptr, UndefBit}; } TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const { using namespace llvm::PatternMatch; // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. if (AI->use_empty()) return AtomicExpansionKind::None; if (AI->getOperation() == AtomicRMWInst::Xor) { // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is // preferable to both `cmpxchg` and `btc`. if (match(AI->getOperand(1), m_SignMask())) return AtomicExpansionKind::None; } // If the atomicrmw's result is used by a single bit AND, we may use // bts/btr/btc instruction for these operations. // Note: InstCombinePass can cause a de-optimization here. It replaces the // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor // (depending on CC). This pattern can only use bts/btr/btc but we don't // detect it. Instruction *I = AI->user_back(); auto BitChange = FindSingleBitChange(AI->getValOperand()); if (BitChange.second == UndefBit || !AI->hasOneUse() || I->getOpcode() != Instruction::And || AI->getType()->getPrimitiveSizeInBits() == 8 || AI->getParent() != I->getParent()) return AtomicExpansionKind::CmpXChg; unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0; // This is a redundant AND, it should get cleaned up elsewhere. if (AI == I->getOperand(OtherIdx)) return AtomicExpansionKind::CmpXChg; // The following instruction must be a AND single bit. 
if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) { auto *C1 = cast(AI->getValOperand()); auto *C2 = dyn_cast(I->getOperand(OtherIdx)); if (!C2 || !isPowerOf2_64(C2->getZExtValue())) { return AtomicExpansionKind::CmpXChg; } if (AI->getOperation() == AtomicRMWInst::And) { return ~C1->getValue() == C2->getValue() ? AtomicExpansionKind::BitTestIntrinsic : AtomicExpansionKind::CmpXChg; } return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic : AtomicExpansionKind::CmpXChg; } assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit); auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx)); if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit) return AtomicExpansionKind::CmpXChg; assert(BitChange.first != nullptr && BitTested.first != nullptr); // If shift amounts are not the same we can't use BitTestIntrinsic. if (BitChange.first != BitTested.first) return AtomicExpansionKind::CmpXChg; // If atomic AND need to be masking all be one bit and testing the one bit // unset in the mask. if (AI->getOperation() == AtomicRMWInst::And) return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit) ? AtomicExpansionKind::BitTestIntrinsic : AtomicExpansionKind::CmpXChg; // If atomic XOR/OR need to be setting and testing the same bit. return (BitChange.second == ShiftBit && BitTested.second == ShiftBit) ? 
AtomicExpansionKind::BitTestIntrinsic : AtomicExpansionKind::CmpXChg; } void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { IRBuilder<> Builder(AI); Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections}); Intrinsic::ID IID_C = Intrinsic::not_intrinsic; Intrinsic::ID IID_I = Intrinsic::not_intrinsic; switch (AI->getOperation()) { default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Or: IID_C = Intrinsic::x86_atomic_bts; IID_I = Intrinsic::x86_atomic_bts_rm; break; case AtomicRMWInst::Xor: IID_C = Intrinsic::x86_atomic_btc; IID_I = Intrinsic::x86_atomic_btc_rm; break; case AtomicRMWInst::And: IID_C = Intrinsic::x86_atomic_btr; IID_I = Intrinsic::x86_atomic_btr_rm; break; } Instruction *I = AI->user_back(); LLVMContext &Ctx = AI->getContext(); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), PointerType::getUnqual(Ctx)); Function *BitTest = nullptr; Value *Result = nullptr; auto BitTested = FindSingleBitChange(AI->getValOperand()); assert(BitTested.first != nullptr); if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) { auto *C = cast(I->getOperand(I->getOperand(0) == AI ? 1 : 0)); BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType()); unsigned Imm = llvm::countr_zero(C->getZExtValue()); Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); } else { BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType()); assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit); Value *SI = BitTested.first; assert(SI != nullptr); // BT{S|R|C} on memory operand don't modulo bit position so we need to // mask it. 
unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits(); Value *BitPos = Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1)); // Todo(1): In many cases it may be provable that SI is less than // ShiftBits in which case this mask is unnecessary // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in // favor of just a raw BT{S|R|C}. Result = Builder.CreateCall(BitTest, {Addr, BitPos}); Result = Builder.CreateZExtOrTrunc(Result, AI->getType()); // If the result is only used for zero/non-zero status then we don't need to // shift value back. Otherwise do so. for (auto It = I->user_begin(); It != I->user_end(); ++It) { if (auto *ICmp = dyn_cast(*It)) { if (ICmp->isEquality()) { auto *C0 = dyn_cast(ICmp->getOperand(0)); auto *C1 = dyn_cast(ICmp->getOperand(1)); if (C0 || C1) { assert(C0 == nullptr || C1 == nullptr); if ((C0 ? C0 : C1)->isZero()) continue; } } } Result = Builder.CreateShl(Result, BitPos); break; } } I->replaceAllUsesWith(Result); I->eraseFromParent(); AI->eraseFromParent(); } static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) { using namespace llvm::PatternMatch; if (!AI->hasOneUse()) return false; Value *Op = AI->getOperand(1); ICmpInst::Predicate Pred; Instruction *I = AI->user_back(); AtomicRMWInst::BinOp Opc = AI->getOperation(); if (Opc == AtomicRMWInst::Add) { if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) { if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) return Pred == CmpInst::ICMP_SLT; if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) return Pred == CmpInst::ICMP_SGT; } return false; } if (Opc == AtomicRMWInst::Sub) { if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; if (match(I, 
m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) { if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) return Pred == CmpInst::ICMP_SLT; if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) return Pred == CmpInst::ICMP_SGT; } return false; } if ((Opc == AtomicRMWInst::Or && match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) || (Opc == AtomicRMWInst::And && match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) { if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT; if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) return Pred == CmpInst::ICMP_SGT; return false; } if (Opc == AtomicRMWInst::Xor) { if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value()))) return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) { if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) return Pred == CmpInst::ICMP_SLT; if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) return Pred == CmpInst::ICMP_SGT; } return false; } return false; } void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic( AtomicRMWInst *AI) const { IRBuilder<> Builder(AI); Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections}); Instruction *TempI = nullptr; LLVMContext &Ctx = AI->getContext(); ICmpInst *ICI = dyn_cast(AI->user_back()); if (!ICI) { TempI = AI->user_back(); assert(TempI->hasOneUse() && "Must have one use"); ICI = cast(TempI->user_back()); } X86::CondCode CC = X86::COND_INVALID; ICmpInst::Predicate Pred = ICI->getPredicate(); switch (Pred) { default: llvm_unreachable("Not supported Pred"); case CmpInst::ICMP_EQ: CC = X86::COND_E; break; case CmpInst::ICMP_NE: CC = X86::COND_NE; break; case CmpInst::ICMP_SLT: CC = X86::COND_S; break; case CmpInst::ICMP_SGT: CC = X86::COND_NS; break; } Intrinsic::ID IID = Intrinsic::not_intrinsic; switch (AI->getOperation()) { default: 
llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Add: IID = Intrinsic::x86_atomic_add_cc; break; case AtomicRMWInst::Sub: IID = Intrinsic::x86_atomic_sub_cc; break; case AtomicRMWInst::Or: IID = Intrinsic::x86_atomic_or_cc; break; case AtomicRMWInst::And: IID = Intrinsic::x86_atomic_and_cc; break; case AtomicRMWInst::Xor: IID = Intrinsic::x86_atomic_xor_cc; break; } Function *CmpArith = Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), PointerType::getUnqual(Ctx)); Value *Call = Builder.CreateCall( CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)}); Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx)); ICI->replaceAllUsesWith(Result); ICI->eraseFromParent(); if (TempI) TempI->eraseFromParent(); AI->eraseFromParent(); } TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { case AtomicRMWInst::Xchg: return AtomicExpansionKind::None; case AtomicRMWInst::Add: case AtomicRMWInst::Sub: if (shouldExpandCmpArithRMWInIR(AI)) return AtomicExpansionKind::CmpArithIntrinsic; // It's better to use xadd, xsub or xchg for these in other cases. 
return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: if (shouldExpandCmpArithRMWInIR(AI)) return AtomicExpansionKind::CmpArithIntrinsic; return shouldExpandLogicAtomicRMWInIR(AI); case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: case AtomicRMWInst::FMax: case AtomicRMWInst::FMin: case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: default: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; } } LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; // If this is a canonical idempotent atomicrmw w/no uses, we have a better // lowering available in lowerAtomicArith. // TODO: push more cases through this path. if (auto *C = dyn_cast(AI->getValOperand())) if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && AI->use_empty()) return nullptr; IRBuilder<> Builder(AI); Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections}); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); // Before the load we need a fence. 
Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence // is required: // Thread 0: // x.store(1, relaxed); // r1 = y.fetch_add(0, release); // Thread 1: // y.fetch_add(42, acquire); // r2 = x.load(relaxed); // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SSID == SyncScope::SingleThread) // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; if (!Subtarget.hasMFence()) // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare // enough that we do not bother. return nullptr; Function *MFence = llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad( AI->getType(), AI->getPointerOperand(), AI->getAlign()); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } /// Emit a locked operation on a stack location which does not change any /// memory location, but does involve a lock prefix. Location is chosen to be /// a) very likely accessed only by a single thread to minimize cache traffic, /// and b) definitely dereferenceable. Returns the new Chain result. 
static SDValue emitLockedStackOp(SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget, SDValue Chain,
                                 const SDLoc &DL) {
  // Implementation notes:
  // 1) LOCK prefix creates a full read/write reordering barrier for memory
  // operations issued by the current processor.  As such, the location
  // referenced is not relevant for the ordering properties of the instruction.
  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
  // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
  // 2) Using an immediate operand appears to be the best encoding choice
  // here since it doesn't require an extra register.
  // 3) OR appears to be very slightly faster than ADD. (Though, the difference
  // is small enough it might just be measurement noise.)
  // 4) When choosing offsets, there are several contributing factors:
  //   a) If there's no redzone, we default to TOS.  (We could allocate a cache
  //      line aligned stack object to improve this case.)
  //   b) To minimize our chances of introducing a false dependence, we prefer
  //      to offset the stack usage from TOS slightly.
  //   c) To minimize concerns about cross thread stack usage - in particular,
  //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
  //      captures state in the TOS frame and accesses it from many threads -
  //      we want to use an offset such that the offset is in a distinct cache
  //      line from the TOS frame.
  //
  // For a general discussion of the tradeoffs and benchmark results, see:
  // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/

  auto &MF = DAG.getMachineFunction();
  auto &TFL = *Subtarget.getFrameLowering();
  // NOTE: -64 is deliberately stored in an unsigned, i.e. as its wrapped
  // (two's complement) bit pattern; it is only consumed as the i32
  // displacement operand below.
  const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;

  // 64-bit and 32-bit mode build the same "lock or $0, disp(%sp)" node; they
  // differ only in the stack pointer register and the width of the base and
  // index register operands.
  const bool Is64Bit = Subtarget.is64Bit();
  const MVT PtrVT = Is64Bit ? MVT::i64 : MVT::i32;
  const unsigned StackPtrReg = Is64Bit ? X86::RSP : X86::ESP;

  SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
  SDValue Ops[] = {
      DAG.getRegister(StackPtrReg, PtrVT),           // Base
      DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
      DAG.getRegister(0, PtrVT),                     // Index
      DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
      DAG.getRegister(0, MVT::i16),                  // Segment.
      Zero,
      Chain};
  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
                                   MVT::Other, Ops);
  // Result 1 of the machine node is the output chain.
  return SDValue(Res, 1);
}

/// Lower an atomic fence node.
///
/// A sequentially-consistent, system-scope fence becomes X86ISD::MFENCE when
/// the subtarget has MFENCE, and a locked stack operation otherwise. Every
/// other ordering/scope combination becomes ISD::MEMBARRIER. In all cases the
/// returned value is the new chain.
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering =
      static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
  SyncScope::ID FenceSSID =
      static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceSSID == SyncScope::System) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    SDValue Chain = Op.getOperand(0);
    return emitLockedStackOp(DAG, Subtarget, Chain, dl);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

/// Lower a compare-and-swap node of type i8/i16/i32/i64 to
/// X86ISD::LCMPXCHG_DAG.
///
/// Operand 2 is copied into the accumulator register matching the value type
/// (AL/AX/EAX/RAX) before the node and the result is copied back out of that
/// register afterwards. The success value is derived from EFLAGS with COND_E.
/// Returns MERGE_VALUES of (loaded value, success, chain).
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch (T.SimpleTy) {
  default:
    llvm_unreachable("Invalid value type!");
  case MVT::i8:
    Reg = X86::AL;
    size = 1;
    break;
  case MVT::i16:
    Reg = X86::AX;
    size = 2;
    break;
  case MVT::i32:
    Reg = X86::EAX;
    size = 4;
    break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX;
    size = 8;
    break;
  }
  // Glue the register copy to the cmpxchg node so nothing is scheduled
  // between them.
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = {cpIn.getValue(0),
                   Op.getOperand(1),
                   Op.getOperand(3),
                   DAG.getTargetConstant(size, DL, MVT::i8),
                   cpIn.getValue(1)};
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result =
      DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), cpOut, Success,
                     EFLAGS.getValue(1));
}

// Create MOVMSKB, taking into account whether we need to split for AVX1.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  const MVT InVT = V.getSimpleValueType();

  // v64i8: compute the mask of each half recursively and glue the two
  // results into a single i64 (low half in the low 32 bits).
  if (InVT == MVT::v64i8) {
    auto [LoHalf, HiHalf] = DAG.SplitVector(V, DL);
    SDValue LoMask = getPMOVMSKB(DL, LoHalf, DAG, Subtarget);
    SDValue HiMask = getPMOVMSKB(DL, HiHalf, DAG, Subtarget);
    LoMask = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, LoMask);
    HiMask = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, HiMask);
    SDValue ShiftBy32 = DAG.getConstant(32, DL, MVT::i8);
    HiMask = DAG.getNode(ISD::SHL, DL, MVT::i64, HiMask, ShiftBy32);
    return DAG.getNode(ISD::OR, DL, MVT::i64, LoMask, HiMask);
  }

  // v32i8 without 256-bit integer support: issue one MOVMSK per 128-bit half
  // and combine them into an i32 (high half shifted up by 16).
  const bool NeedsSplit256 = InVT == MVT::v32i8 && !Subtarget.hasInt256();
  if (NeedsSplit256) {
    auto [LoHalf, HiHalf] = DAG.SplitVector(V, DL);
    SDValue LoMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, LoHalf);
    SDValue HiMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, HiHalf);
    SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::i8);
    HiMask = DAG.getNode(ISD::SHL, DL, MVT::i32, HiMask, ShiftBy16);
    return DAG.getNode(ISD::OR, DL, MVT::i32, LoMask, HiMask);
  }

  // Everything else maps onto a single MOVMSK.
  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
}

/// Custom lowering for ISD::BITCAST.
///
/// Handled forms:
///   - i64 -> v64i1: split into two i32 halves, bitcast each to v32i1 and
///     concatenate.
///   - v16i1/v32i1 -> scalar integer: sign-extend to a byte vector, take the
///     MOVMSK and zero-extend/truncate to the destination type.
///   - i64 -> f64 and (v2i32|v4i16|v8i8) -> x86mmx: go through a 128-bit
///     vector.
/// Any other combination returns SDValue() so that it gets expanded.
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  SDValue Src = Op.getOperand(0);
  const MVT SrcVT = Src.getSimpleValueType();
  const MVT DstVT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
  // half to v32i1 and concatenating the result.
  if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
    assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
    assert(Subtarget.hasBWI() && "Expected BWI target");
    auto [LoBits, HiBits] = DAG.SplitScalar(Src, DL, MVT::i32, MVT::i32);
    SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoBits);
    SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiBits);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v64i1, LoMask, HiMask);
  }

  // Use MOVMSK for vector to scalar conversion to prevent scalarization.
  const bool IsMaskVecSrc = SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1;
  if (IsMaskVecSrc && DstVT.isScalarInteger()) {
    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
    MVT ByteVecVT = MVT::v32i8;
    if (SrcVT == MVT::v16i1)
      ByteVecVT = MVT::v16i8;
    SDValue Bytes = DAG.getSExtOrTrunc(Src, DL, ByteVecVT);
    SDValue Mask = getPMOVMSKB(DL, Bytes, DAG, Subtarget);
    return DAG.getZExtOrTrunc(Mask, DL, DstVT);
  }

  assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
          SrcVT == MVT::i64) && "Unexpected VT!");

  assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

  // Only i64 -> f64 and vector -> x86mmx are handled here; this conversion
  // needs to be expanded otherwise.
  const bool IsI64ToF64 = DstVT == MVT::f64 && SrcVT == MVT::i64;
  const bool IsVecToMMX = DstVT == MVT::x86mmx && SrcVT.isVector();
  if (!IsI64ToF64 && !IsVecToMMX)
    return SDValue();

  SDValue Wide;
  if (SrcVT.isVector()) {
    // Widen the vector in input in the case of MVT::v2i32.
    // Example: from MVT::v2i32 to MVT::v4i32.
    const MVT DoubledVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
                                           SrcVT.getVectorNumElements() * 2);
    Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubledVT, Src,
                       DAG.getUNDEF(SrcVT));
  } else {
    assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
           "Unexpected source type in LowerBITCAST");
    Wide = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2i64, Src);
  }

  // View the 128-bit value as two 64-bit lanes of the appropriate kind.
  const MVT V2X64VT = (DstVT == MVT::f64) ? MVT::v2f64 : MVT::v2i64;
  Wide = DAG.getNode(ISD::BITCAST, DL, V2X64VT, Wide);

  if (DstVT == MVT::x86mmx)
    return DAG.getNode(X86ISD::MOVDQ2Q, DL, DstVT, Wide);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DstVT, Wide,
                     DAG.getIntPtrConstant(0, DL));
}

/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // PSADBW instruction horizontally add all bytes and leave the result in i64
  // chunks, thus directly computes the pop count for v2i64 and v4i64.
  if (EltVT == MVT::i64) {
    SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = DAG.getConstant(0, DL, VT);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
    SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = DAG.getConstant(0, DL, ByteVecVT);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type");

  // To obtain pop count for each i16 element starting from the pop count for
  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s as i8 vector shift isn't
  // directly supported.
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}

/// Compute the per-byte population count of the vXi8 vector \p Op using two
/// PSHUFB lookups into a 16-entry nibble table held in a register.
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  int NumElts = VT.getVectorNumElements();
  (void)EltVT;
  assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that every lower byte nibble in the input vector is an
  // index into a in-register pre-computed pop count table. We then split up the
  // input vector in two new ones: (1) a vector with only the shifted-right
  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
  // masked out higher ones) for each byte. PSHUFB is used separately with both
  // to index the in-register table. Next, both are added and the result is a
  // i8 vector where each element contains the pop count for input byte.
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

  // The 16-entry table is repeated to fill the whole vector.
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, VT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, VT);
  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);

  // Low nibbles
  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);

  // The input vector is used as the shuffle mask that index elements into the
  // LUT. After counting low and high nibbles, add the vector to obtain the
  // final pop count per i8 element.
  SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
  SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
  return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
}

// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
//
// Lowers a 128/256/512-bit vector CTPOP. Returns SDValue() when the byte LUT
// cannot be used (no SSSE3) so that LegalizeDAG handles the node.
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDValue Op0 = Op.getOperand(0);

  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
  if (Subtarget.hasVPOPCNTDQ()) {
    unsigned NumElems = VT.getVectorNumElements();
    assert((VT.getVectorElementType() == MVT::i8 ||
            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
    if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntUnary(Op, DAG, DL);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG, DL);

  // For element types greater than i8, do vXi8 pop counts and a bytesum.
  if (VT.getScalarType() != MVT::i8) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
  }

  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
  if (!Subtarget.hasSSSE3())
    return SDValue();

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}

/// Custom lowering for ISD::CTPOP.
///
/// For scalars, known-bits information is used to find how many bits can be
/// set; values with at most 2, 3, 4 (when i64 is legal) or 8 such bits get a
/// dedicated short sequence, anything else returns SDValue() to fall back to
/// the generic expansion. Vectors are forwarded to LowerVectorCTPOP.
static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = N.getSimpleValueType();
  SDValue Op = N.getOperand(0);
  SDLoc DL(N);

  if (VT.isScalarInteger()) {
    // Compute the lower/upper bounds of the active bits of the value,
    // allowing us to shift the active bits down if necessary to fit into the
    // special cases below.
    KnownBits Known = DAG.computeKnownBits(Op);
    unsigned LZ = Known.countMinLeadingZeros();
    unsigned TZ = Known.countMinTrailingZeros();
    assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
    unsigned ActiveBits = Known.getBitWidth() - LZ;
    unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);

    // NOTE: the shifts by TZ below operate on the original value and use VT;
    // every shift performed after the conversion to i32 takes its shift
    // amount type from MVT::i32, the type actually being shifted.

    // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
    if (ShiftedActiveBits <= 2) {
      if (ActiveBits > 2)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
                       DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
                                   DAG.getShiftAmountConstant(1, MVT::i32, DL)));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    // i3 CTPOP - perform LUT into i32 integer.
    if (ShiftedActiveBits <= 3) {
      if (ActiveBits > 3)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      // Each table entry is 2 bits wide, so index by 2 * x.
      Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
                       DAG.getShiftAmountConstant(1, MVT::i32, DL));
      Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
                       DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
                       DAG.getConstant(0x3, DL, MVT::i32));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    // i4 CTPOP - perform LUT into i64 integer.
    if (ShiftedActiveBits <= 4 &&
        DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
      SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
      if (ActiveBits > 4)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      // Each table entry is 4 bits wide, so index by 4 * x.
      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
                       DAG.getConstant(4, DL, MVT::i32));
      Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
                       DAG.getShiftAmountOperand(MVT::i64, Op));
      Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
                       DAG.getConstant(0x7, DL, MVT::i64));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
    if (ShiftedActiveBits <= 8) {
      SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
      if (ActiveBits > 8)
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getShiftAmountConstant(TZ, VT, DL));
      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
                       DAG.getConstant(0x08040201U, DL, MVT::i32));
      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
                       DAG.getShiftAmountConstant(3, MVT::i32, DL));
      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
                       DAG.getShiftAmountConstant(28, MVT::i32, DL));
      return DAG.getZExtOrTrunc(Op, DL, VT);
    }

    return SDValue(); // fallback to generic expansion.
  }

  assert(VT.isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(N, DL, Subtarget, DAG);
}

/// Lower BITREVERSE using the XOP VPPERM instruction.
///
/// Scalars are moved into a 128-bit vector, reversed there and extracted
/// again; 256-bit vectors are split; 128-bit vectors use a single VPPERM
/// whose mask both reverses the bits of each byte and swaps the bytes of
/// each element.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return splitVectorIntUnary(Op, DAG, DL);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    // Walk the bytes of each element from most to least significant so the
    // byte order is swapped as well.
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}

/// Custom lowering for ISD::BITREVERSE.
///
/// Uses the XOP lowering when available for non-512-bit types. Otherwise
/// (SSSE3 required) wide vectors are split when the subtarget lacks BWI/AVX2,
/// scalars and non-byte vectors are reduced to a byte-vector BITREVERSE plus
/// BSWAP, and byte vectors use GF2P8AFFINEQB (GFNI) or two PSHUFB nibble
/// lookups.
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return splitVectorIntUnary(Op, DAG, DL);

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return splitVectorIntUnary(Op, DAG, DL);

  // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
  if (!VT.isVector()) {
    assert(
        (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
        "Only tested for i8/i16/i32/i64");
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
                      DAG.getBitcast(MVT::v16i8, Res));
    Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                      DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
    // A single byte has no byte order to fix up.
    return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
  }

  assert(VT.isVector() && VT.getSizeInBits() >= 128);

  // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
  if (VT.getScalarType() != MVT::i8) {
    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
    SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
    Res = DAG.getBitcast(ByteVT, Res);
    Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
    return DAG.getBitcast(VT, Res);
  }
  assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  unsigned NumElts = VT.getVectorNumElements();

  // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
  if (Subtarget.hasGFNI()) {
    SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
    return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
                       DAG.getTargetConstant(0, DL, MVT::i8));
  }

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  // Both 16-entry tables are repeated to fill the whole vector.
  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}

/// Custom lowering for ISD::PARITY.
///
/// Values known to fit in 8 bits use an 8-bit compare and read the inverted
/// parity flag. Otherwise, with POPCNT the default expansion is used
/// (SDValue() is returned); without it the value is folded down to 8 bits with
/// XORs and the inverted parity flag of the final 8-bit XOR is read.
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(0);
  MVT VT = Op.getSimpleValueType();

  // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
  if (VT == MVT::i8 ||
      DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
    X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
    SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
                                DAG.getConstant(0, DL, MVT::i8));
    // Copy the inverse of the parity flag into a register with setcc.
    SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
    // Extend to the original type.
    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
  }

  // If we have POPCNT, use the default expansion.
  if (Subtarget.hasPOPCNT())
    return SDValue();

  if (VT == MVT::i64) {
    // Xor the high and low 32-bits together using a 32-bit operation.
    SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
                             DAG.getNode(ISD::SRL, DL, MVT::i64, X,
                                         DAG.getConstant(32, DL, MVT::i8)));
    SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
    X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
  }

  if (VT != MVT::i16) {
    // Xor the high and low 16-bits together using a 32-bit operation.
    SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
                               DAG.getConstant(16, DL, MVT::i8));
    X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
  } else {
    // If the input is 16-bits, we need to extend to use an i32 shift below.
    X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
  }

  // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
  // This should allow an h-reg to be used to save a shift.
  SDValue Hi = DAG.getNode(
      ISD::TRUNCATE, DL, MVT::i8,
      DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
  SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
  SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);

  // Copy the inverse of the parity flag into a register with setcc.
  SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
  // Extend to the original type.
  return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
}

/// Map an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node onto the matching
/// X86ISD::L{ADD,SUB,OR,XOR,AND} memory-intrinsic node, which produces
/// (i32, chain). Any other opcode is unreachable.
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}

/// Lower atomic_load_ops into LOCK-prefixed operations.
///
/// When the loaded value is used, SUB and XOR-with-sign-bit are rewritten as
/// ATOMIC_LOAD_ADD and an ADD node is returned unchanged. When it is unused,
/// the canonical idempotent `or 0` form is lowered to just its ordering
/// effect, and everything else becomes a LOCK-prefixed operation whose value
/// result is replaced by UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused.  They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
    // can use LXADD as opposed to cmpxchg.
    if (Opc == ISD::ATOMIC_LOAD_SUB ||
        (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           DAG.getNegative(RHS, DL, VT), AN->getMemOperand());

    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  // Specialized lowering for the canonical form of an idempotent atomicrmw.
  // The core idea here is that since the memory location isn't actually
  // changing, all we need is a lowering for the *ordering* impacts of the
  // atomicrmw.  As such, we can choose a different operation and memory
  // location to minimize impact on other code.
  // The above holds unless the node is marked volatile in which
  // case it needs to be preserved according to the langref.
  if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
    // On X86, the only ordering which actually requires an instruction is
    // seq_cst which isn't SingleThread, everything just needs to be preserved
    // during codegen and then dropped. Note that we expect (but don't assume),
    // that orderings other than seq_cst and acq_rel have been canonicalized to
    // a store or load.
    if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
        AN->getSyncScopeID() == SyncScope::System) {
      // Prefer a locked operation against a stack location to minimize cache
      // traffic.  This assumes that stack locations are very likely to be
      // accessed only by the owning thread.
      SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
      assert(!N->hasAnyUseOfValue(0));
      // NOTE: The getUNDEF is needed to give something for the unused result 0.
      return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
                         DAG.getUNDEF(VT), NewChain);
    }
    // MEMBARRIER is a compiler barrier; it codegens to a no-op.
    SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
    assert(!N->hasAnyUseOfValue(0));
    // NOTE: The getUNDEF is needed to give something for the unused result 0.
    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
                       DAG.getUNDEF(VT), NewChain);
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  // NOTE: The getUNDEF is needed to give something for the unused result 0.
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
                     DAG.getUNDEF(VT), LockOp.getValue(1));
}

/// Custom lowering for atomic stores.
///
/// Legal-typed stores that are not seq_cst are kept as is. Illegal i128 (with
/// AVX in 64-bit mode) and i64 (with SSE1 or x87) stores are emitted as a
/// single vector/FP store, followed by a locked stack operation when seq_cst,
/// provided soft-float and NoImplicitFloat are not in effect. Everything else
/// is turned into an ATOMIC_SWAP whose chain is returned.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  auto *Node = cast<AtomicSDNode>(Op.getNode());
  SDLoc dl(Node);
  EVT VT = Node->getMemoryVT();

  bool IsSeqCst =
      Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
  bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);

  // If this store is not sequentially consistent and the type is legal
  // we can just keep it.
  if (!IsSeqCst && IsTypeLegal)
    return Op;

  if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
      !DAG.getMachineFunction().getFunction().hasFnAttribute(
          Attribute::NoImplicitFloat)) {
    // Stays null unless one of the cases below produced a store.
    SDValue Chain;
    // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
    // vector store.
    if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
      SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
      Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
                           Node->getMemOperand());
    }

    // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
    // is enabled.
    if (VT == MVT::i64) {
      if (Subtarget.hasSSE1()) {
        SDValue SclToVec =
            DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
        MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
        SclToVec = DAG.getBitcast(StVT, SclToVec);
        SDVTList Tys = DAG.getVTList(MVT::Other);
        SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
        Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
                                        MVT::i64, Node->getMemOperand());
      } else if (Subtarget.hasX87()) {
        // First load this into an 80-bit X87 register using a stack temporary.
        // This will put the whole integer into the significand.
        SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
        int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
        MachinePointerInfo MPI =
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
        Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
                             MPI, MaybeAlign(), MachineMemOperand::MOStore);
        SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
        SDValue LdOps[] = {Chain, StackPtr};
        SDValue Value = DAG.getMemIntrinsicNode(
            X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
            /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
        Chain = Value.getValue(1);

        // Now use an FIST to do the atomic store.
        SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
        Chain =
            DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
                                    StoreOps, MVT::i64, Node->getMemOperand());
      }
    }

    if (Chain) {
      // If this is a sequentially consistent store, also emit an appropriate
      // barrier.
      if (IsSeqCst)
        Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);

      return Chain;
    }
  }

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
                               Node->getOperand(0), Node->getOperand(2),
                               Node->getOperand(1), Node->getMemOperand());
  return Swap.getValue(1);
}

/// Lower {U,S}ADDO_CARRY / {U,S}SUBO_CARRY to X86ISD::ADC / X86ISD::SBB.
///
/// The incoming carry is turned into an EFLAGS value by adding all-ones to
/// it. The overflow result is read with COND_O for the signed opcodes and
/// COND_B for the unsigned ones. Returns SDValue() when the result type is not
/// legal yet.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);
  unsigned Opc = Op.getOpcode();

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(N);

  // Set the carry flag.
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getAllOnesConstant(DL, CarryVT));

  bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
  SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
                            Op.getOperand(0), Op.getOperand(1),
                            Carry.getValue(1));

  bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
  SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
                           Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

/// Lower FSINCOS to a call of the SINCOS_STRET libcall (64-bit Darwin only).
///
/// For f64 the call result is returned directly; for f32 the two results are
/// extracted from elements 0 and 1 of the returned vector and merged.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = TLI.getLibcallName(LC);
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)FixedVectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}

/// Widen a vector input to a vector of NVT.  The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
MVT InVT = InOp.getSimpleValueType(); if (InVT == NVT) return InOp; if (InOp.isUndef()) return DAG.getUNDEF(NVT); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); unsigned InNumElts = InVT.getVectorNumElements(); unsigned WidenNumElts = NVT.getVectorNumElements(); assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { SDValue N1 = InOp.getOperand(1); if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || N1.isUndef()) { InOp = InOp.getOperand(0); InVT = InOp.getSimpleValueType(); InNumElts = InVT.getVectorNumElements(); } } if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { SmallVector Ops; for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); EVT EltVT = InOp.getOperand(0).getValueType(); SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) Ops.push_back(FillVal); return DAG.getBuildVector(NVT, dl, Ops); } SDValue FillVal = FillWithZeroes ? 
DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedScatterSDNode *N = cast(Op.getNode()); SDValue Src = N->getValue(); MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); SDValue Scale = N->getScale(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); if (VT == MVT::v2f32 || VT == MVT::v2i32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, N->getMemoryVT(), N->getMemOperand()); } return SDValue(); } MVT IndexVT = Index.getSimpleValueType(); // If the index is v2i32, we're being called by type legalization and we // should just let the default handling take care of it. if (IndexVT == MVT::v2i32) return SDValue(); // If we don't have VLX and neither the passthru or index is 512-bits, we // need to widen until one is. if (!Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. 
unsigned Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); Src = ExtendToType(Src, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, N->getMemoryVT(), N->getMemOperand()); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast(Op.getNode()); MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); MVT MaskVT = Mask.getSimpleValueType(); SDValue PassThru = N->getPassThru(); SDLoc dl(Op); // Handle AVX masked loads which don't support passthru other than 0. if (MaskVT.getVectorElementType() != MVT::i1) { // We also allow undef in the isel pattern. if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) return Op; SDValue NewLoad = DAG.getMaskedLoad( VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); // Emit a blend. 
SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); } assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked load op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked load op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); PassThru = ExtendToType(PassThru, WideDataVT, DAG); // Mask element has to be i1. assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && "Unexpected mask type"); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); SDValue NewLoad = DAG.getMaskedLoad( WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), DAG.getIntPtrConstant(0, dl)); SDValue RetOps[] = {Extract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedStoreSDNode *N = cast(Op.getNode()); SDValue DataToStore = N->getValue(); MVT VT = DataToStore.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); assert((!N->isCompressingStore() || Subtarget.hasAVX512()) && "Expanding masked load is supported on 
AVX-512 target only!"); assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked store op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked store op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); // Mask element has to be i1. assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && "Unexpected mask type"); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), N->getOffset(), Mask, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore()); } static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue PassThru = N->getPassThru(); MVT IndexVT = Index.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); // If the index is v2i32, we're being called by type legalization. if (IndexVT == MVT::v2i32) return SDValue(); // If we don't have VLX and neither the passthru or index is 512-bits, we // need to widen until one is. 
MVT OrigVT = VT; if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !IndexVT.is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. unsigned Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); PassThru = ExtendToType(PassThru, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } // Break dependency on the data register. if (PassThru.isUndef()) PassThru = getZeroVector(VT, Subtarget, DAG, dl); SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, N->getScale() }; SDValue NewGather = DAG.getMemIntrinsicNode( X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(), N->getMemOperand()); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather, DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl); } static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); SDValue Src = Op.getOperand(0); MVT DstVT = Op.getSimpleValueType(); AddrSpaceCastSDNode *N = cast(Op.getNode()); unsigned SrcAS = N->getSrcAddressSpace(); assert(SrcAS != N->getDestAddressSpace() && "addrspacecast must be between different address spaces"); if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) { Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); } else if (DstVT == MVT::i64) { Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); } else if (DstVT == MVT::i32) { Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); } else { report_fatal_error("Bad address space in addrspacecast"); } return Op; } SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be 
informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); } // Custom split CVTPS2PH with wide types. static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); EVT VT = Op.getValueType(); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); SDValue RC = Op.getOperand(1); Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC); Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { unsigned IsData = Op.getConstantOperandVal(4); // We don't support non-data prefetch without PREFETCHI. // Just preserve the chain. if (!IsData && !Subtarget.hasPREFETCHI()) return Op.getOperand(0); return Op; } static StringRef getInstrStrFromOpNo(const SmallVectorImpl &AsmStrs, unsigned OpNo) { const APInt Operand(32, OpNo); std::string OpNoStr = llvm::toString(Operand, 10, false); std::string Str(" $"); std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1) std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P} auto I = StringRef::npos; for (auto &AsmStr : AsmStrs) { // Match the OpNo string. We should match exactly to exclude match // sub-string, e.g. "$12" contain "$1" if (AsmStr.ends_with(OpNoStr1)) I = AsmStr.size() - OpNoStr1.size(); // Get the index of operand in AsmStr. 
if (I == StringRef::npos) I = AsmStr.find(OpNoStr1 + ","); if (I == StringRef::npos) I = AsmStr.find(OpNoStr2); if (I == StringRef::npos) continue; assert(I > 0 && "Unexpected inline asm string!"); // Remove the operand string and label (if exsit). // For example: // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}" // ==> // ".L__MSASMLABEL_.${:uid}__l:call dword ptr " // ==> // "call dword ptr " auto TmpStr = AsmStr.substr(0, I); I = TmpStr.rfind(':'); if (I != StringRef::npos) TmpStr = TmpStr.substr(I + 1); return TmpStr.take_while(llvm::isAlpha); } return StringRef(); } bool X86TargetLowering::isInlineAsmTargetBranch( const SmallVectorImpl &AsmStrs, unsigned OpNo) const { // In a __asm block, __asm inst foo where inst is CALL or JMP should be // changed from indirect TargetLowering::C_Memory to direct // TargetLowering::C_Address. // We don't need to special case LOOP* and Jcc, which cannot target a memory // location. StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo); return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp"); } static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask) { EVT Ty = MVT::i8; auto V = DAG.getBitcast(MVT::i1, Mask); auto VE = DAG.getZExtOrTrunc(V, DL, Ty); auto Zero = DAG.getConstant(0, DL, Ty); SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32); auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE); return SDValue(CmpZero.getNode(), 1); } SDValue X86TargetLowering::visitMaskedLoad( SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const { // @llvm.masked.load.v1*(ptr, alignment, mask, passthru) // -> // _, flags = SUB 0, mask // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags // bit_cast_to_vector EVT VTy = PassThru.getValueType(); EVT Ty = VTy.getVectorElementType(); SDVTList Tys = DAG.getVTList(Ty, MVT::Other); auto ScalarPassThru = PassThru.isUndef() ? 
DAG.getConstant(0, DL, Ty) : DAG.getBitcast(Ty, PassThru); auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask); auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags}; NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO); return DAG.getBitcast(VTy, NewLoad); } SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const { // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask) // -> // _, flags = SUB 0, mask // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags EVT Ty = Val.getValueType().getVectorElementType(); SDVTList Tys = DAG.getVTList(MVT::Other); auto ScalarVal = DAG.getBitcast(Ty, Val); auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask); auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags}; return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO); } /// Provide custom lowering hooks for some operations. 
///
/// Dispatches on Op.getOpcode() and forwards Op to the matching lowering
/// helper, returning whatever that helper returns. An opcode without a case
/// below hits llvm_unreachable.
///
/// Helper signatures are not uniform: most take (Op, Subtarget, DAG) or
/// (Op, DAG), while lowerAtomicArith, LowerATOMIC_STORE, lowerAddSub and
/// LowerADDSAT_SUBSAT take (Op, DAG, Subtarget).
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  // clang-format off
  // Any opcode not listed below is not expected to reach this hook.
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
  case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
  case ISD::FSHL:
  case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
  // Strict and non-strict FP conversions share one helper per operation.
  case ISD::STRICT_SINT_TO_FP:
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::STRICT_UINT_TO_FP:
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND:   return LowerFP_EXTEND(Op, DAG);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:    return LowerFP_ROUND(Op, DAG);
  case ISD::FP16_TO_FP:
  case ISD::STRICT_FP16_TO_FP:  return LowerFP16_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16:
  case ISD::STRICT_FP_TO_FP16:  return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_BF16:         return LowerFP_TO_BF16(Op, DAG);
  case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
  case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
  case ISD::FADD:
  case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
  case ISD::FROUND:             return LowerFROUND(Op, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
  case ISD::LRINT:
  case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
  case ISD::SETCC:
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  // Void and chained intrinsics share one helper.
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::GET_ROUNDING:       return LowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
  case ISD::GET_FPENV_MEM:      return LowerGET_FPENV_MEM(Op, DAG);
  case ISD::SET_FPENV_MEM:      return LowerSET_FPENV_MEM(Op, DAG);
  case ISD::RESET_FPENV:        return LowerRESET_FPENV(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:              return LowerXALUO(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO:              return LowerMULO(Op, Subtarget, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::SADDO_CARRY:
  case ISD::SSUBO_CARRY:
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:        return LowerADDSUBO_CARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:                return lowerAddSub(Op, DAG, Subtarget);
  case ISD::UADDSAT:
  case ISD::SADDSAT:
  case ISD::USUBSAT:
  case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:               return LowerMINMAX(Op, Subtarget, DAG);
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
  case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
  case ISD::ABDS:
  case ISD::ABDU:               return LowerABD(Op, Subtarget, DAG);
  case ISD::AVGCEILU:           return LowerAVG(Op, Subtarget, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
  // Both GC transition markers are lowered by the same helper.
  case ISD::GC_TRANSITION_START:
  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
  case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
  // The only target-specific (X86ISD) opcode in this switch.
  case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
  case ISD::PREFETCH:           return LowerPREFETCH(Op, Subtarget, DAG);
  // clang-format on
  }
}

/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); switch (N->getOpcode()) { default: #ifndef NDEBUG dbgs() << "ReplaceNodeResults: "; N->dump(&DAG); #endif llvm_unreachable("Do not know how to custom type legalize this operation!"); case X86ISD::CVTPH2PS: { EVT VT = N->getValueType(0); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo); Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); return; } case X86ISD::STRICT_CVTPH2PS: { EVT VT = N->getValueType(0); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other}, {N->getOperand(0), Lo}); Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other}, {N->getOperand(0), Hi}); SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), Hi.getValue(1)); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); Results.push_back(Chain); return; } case X86ISD::CVTPS2PH: Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG)); return; case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); // If we have at most 32 active bits, then perform as i32 CTPOP. // TODO: Perform this in generic legalizer? 
KnownBits Known = DAG.computeKnownBits(N->getOperand(0)); unsigned LZ = Known.countMinLeadingZeros(); unsigned TZ = Known.countMinTrailingZeros(); if ((LZ + TZ) >= 32) { SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0), DAG.getShiftAmountConstant(TZ, MVT::i64, dl)); Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op); Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op); Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op); Results.push_back(Op); return; } // Use a v2i64 if possible. bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) { SDValue Wide = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0)); Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide); // Bit count should fit in 32-bits, extract it as that and then zero // extend to i64. Otherwise we end up extracting bits 63:32 separately. Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, DAG.getIntPtrConstant(0, dl)); Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); Results.push_back(Wide); } return; } case ISD::MUL: { EVT VT = N->getValueType(0); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"); // Pre-promote these to vXi16 to avoid op legalization thinking all 16 // elements are needed. 
MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); unsigned NumConcats = 16 / VT.getVectorNumElements(); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); Results.push_back(Res); return; } case ISD::SMULO: case ISD::UMULO: { EVT VT = N->getValueType(0); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && VT == MVT::v2i32 && "Unexpected VT!"); bool IsSigned = N->getOpcode() == ISD::SMULO; unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0)); SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1)); SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1); // Extract the high 32 bits from each result using PSHUFD. // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD. SDValue Hi = DAG.getBitcast(MVT::v4i32, Res); Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1}); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi, DAG.getIntPtrConstant(0, dl)); // Truncate the low bits of the result. This will become PSHUFD. Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); SDValue HiCmp; if (IsSigned) { // SMULO overflows if the high bits don't match the sign of the low. HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT)); } else { // UMULO overflows if the high bits are non-zero. HiCmp = DAG.getConstant(0, dl, VT); } SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE); // Widen the result with by padding with undef. 
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, DAG.getUNDEF(VT)); Results.push_back(Res); Results.push_back(Ovf); return; } case X86ISD::VPMADDWD: { // Legalize types for X86ISD::VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), NumConcat * InVT.getVectorNumElements()); EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumConcat * VT.getVectorNumElements()); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = N->getOperand(0); SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); Ops[0] = N->getOperand(1); SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); Results.push_back(Res); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); if (VT.isVector()) { assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? APInt SplatVal; if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) { unsigned NumConcats = 128 / VT.getSizeInBits(); SmallVector Ops0(NumConcats, DAG.getUNDEF(VT)); Ops0[0] = N->getOperand(0); EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0); SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT); SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1); Results.push_back(Res); } return; } SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::TRUNCATE: { MVT VT = N->getSimpleValueType(0); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) return; // The generic legalizer will try to widen the input type to the same // number of elements as the widened result type. But this isn't always // the best thing so do some custom legalization to avoid some cases. 
MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT(); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); EVT InEltVT = InVT.getVectorElementType(); EVT EltVT = VT.getVectorElementType(); unsigned MinElts = VT.getVectorNumElements(); unsigned WidenNumElts = WidenVT.getVectorNumElements(); unsigned InBits = InVT.getSizeInBits(); // See if there are sufficient leading bits to perform a PACKUS/PACKSS. unsigned PackOpcode; if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) { if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) { Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl); Results.push_back(Res); return; } } if ((128 % InBits) == 0 && WidenVT.is128BitVector()) { // 128 bit and smaller inputs should avoid truncate all together and // use a shuffle. if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) { int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits(); SmallVector TruncMask(WidenNumElts, -1); for (unsigned I = 0; I < MinElts; ++I) TruncMask[I] = Scale * I; SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128); assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) && "Illegal vector type in truncation"); WidenIn = DAG.getBitcast(WidenVT, WidenIn); Results.push_back( DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask)); return; } } // With AVX512 there are some cases that can use a target specific // truncate node to go from 256/512 to less than 128 with zeros in the // upper elements of the 128 bit result. if (Subtarget.hasAVX512() && isTypeLegal(InVT)) { // We can use VTRUNC directly if for 256 bits with VLX or for any 512. if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) { Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); return; } // There's one case we can widen to 512 bits and use VTRUNC. 
if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) { In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In, DAG.getUNDEF(MVT::v4i64)); Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); return; } } if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 && getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector && isTypeLegal(MVT::v4i64)) { // Input needs to be split and output needs to widened. Let's use two // VTRUNCs, and shuffle their results together into the wider type. SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(In, dl); Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, { 0, 1, 2, 3, 16, 17, 18, 19, -1, -1, -1, -1, -1, -1, -1, -1 }); Results.push_back(Res); return; } // Attempt to widen the truncation input vector to let LowerTRUNCATE handle // this via type legalization. if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) && (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) && (!Subtarget.hasSSSE3() || (!isTypeLegal(InVT) && !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) { SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, InEltVT.getSizeInBits() * WidenNumElts); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn)); return; } return; } case ISD::ANY_EXTEND: // Right now, only MVT::v8i8 has Custom action for an illegal type. // It's intended to custom handle the input type. 
assert(N->getValueType(0) == MVT::v8i8 && "Do not know how to legalize this Node"); return; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && "Unexpected type action!"); assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting // we allow the sra from the extend to i32 to be shared by the split. In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In); // Fill a vector with sign bits for each element. SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT); // Create an unpackl and unpackh to interleave the sign bits then bitcast // to v2i64. SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {0, 4, 1, 5}); Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo); SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {2, 6, 3, 7}); Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); return; } if (VT == MVT::v16i32 || VT == MVT::v8i64) { if (!InVT.is128BitVector()) { // Not a 128 bit vector, but maybe type legalization will promote // it to 128 bits. if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger) return; InVT = getTypeToTransformTo(*DAG.getContext(), InVT); if (!InVT.is128BitVector()) return; // Promote the input to 128 bits. Type legalization will turn this into // zext_inreg/sext_inreg. 
In = DAG.getNode(N->getOpcode(), dl, InVT, In); } // Perform custom splitting instead of the two stage extend we would get // by default. EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); assert(isTypeLegal(LoVT) && "Split VT not legal?"); SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG); // We need to shift the input over by half the number of elements. unsigned NumElts = InVT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; SmallVector ShufMask(NumElts, SM_SentinelUndef); for (unsigned i = 0; i != HalfNumElts; ++i) ShufMask[i] = i + HalfNumElts; SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); } return; } case ISD::FP_TO_SINT: case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_UINT: { bool IsStrict = N->isStrictFPOpcode(); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT || N->getOpcode() == ISD::STRICT_FP_TO_SINT; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(IsStrict ? 1 : 0); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); EVT SrcVT = Src.getValueType(); SDValue Res; if (isSoftF16(SrcVT, Subtarget)) { EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; if (IsStrict) { Res = DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other}, {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, {Chain, Src})}); Chain = Res.getValue(1); } else { Res = DAG.getNode(N->getOpcode(), dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); } Results.push_back(Res); if (IsStrict) Results.push_back(Chain); return; } if (VT.isVector() && Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) { EVT EleVT = VT.getVectorElementType(); EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16; if (SrcVT != MVT::v8f16) { SDValue Tmp = IsStrict ? 
DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT); SmallVector Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp); Ops[0] = Src; Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops); } if (IsStrict) { unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src}); Chain = Res.getValue(1); } else { unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; Res = DAG.getNode(Opc, dl, ResVT, Src); } // TODO: Need to add exception check code for strict FP. if (EleVT.getSizeInBits() < 16) { MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8); Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res); // Now widen to 128 bits. unsigned NumConcats = 128 / TmpVT.getSizeInBits(); MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(TmpVT)); ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); } Results.push_back(Res); if (IsStrict) Results.push_back(Chain); return; } if (VT.isVector() && VT.getScalarSizeInBits() < 32) { assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), VT.getVectorNumElements()); SDValue Res; SDValue Chain; if (IsStrict) { Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other}, {N->getOperand(0), Src}); Chain = Res.getValue(1); } else Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); // Preserve what we know about the size of the original result. If the // result is v2i32, we have to manually widen the assert. if (PromoteVT == MVT::v2i32) Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, DAG.getUNDEF(MVT::v2i32)); Res = DAG.getNode(!IsSigned ? 
ISD::AssertZext : ISD::AssertSext, dl, Res.getValueType(), Res, DAG.getValueType(VT.getVectorElementType())); if (PromoteVT == MVT::v2i32) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); // Truncate back to the original width. Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); // Now widen to 128 bits. unsigned NumConcats = 128 / VT.getSizeInBits(); MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), VT.getVectorNumElements() * NumConcats); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); Results.push_back(Res); if (IsStrict) Results.push_back(Chain); return; } if (VT == MVT::v2i32) { assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) && "Strict unsigned conversion requires AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { if (!IsSigned && !Subtarget.hasAVX512()) { SDValue Res = expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget); Results.push_back(Res); return; } unsigned Opc; if (IsStrict) Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; else Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; // If we have VLX we can emit a target specific FP_TO_UINT node,. if (!IsSigned && !Subtarget.hasVLX()) { // Otherwise we can defer to the generic legalizer which will widen // the input as well. This will be further widened during op // legalization to v8i32<-v8f64. // For strict nodes we'll need to widen ourselves. // FIXME: Fix the type legalizer to safely widen strict nodes? 
if (!IsStrict) return; Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src, DAG.getConstantFP(0.0, dl, MVT::v2f64)); Opc = N->getOpcode(); } SDValue Res; SDValue Chain; if (IsStrict) { Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other}, {N->getOperand(0), Src}); Chain = Res.getValue(1); } else { Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); } Results.push_back(Res); if (IsStrict) Results.push_back(Chain); return; } // Custom widen strict v2f32->v2i32 by padding with zeros. // FIXME: Should generic type legalizer do this? if (Src.getValueType() == MVT::v2f32 && IsStrict) { Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getConstantFP(0.0, dl, MVT::v2f32)); SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other}, {N->getOperand(0), Src}); Results.push_back(Res); Results.push_back(Res.getValue(1)); return; } // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, // so early out here. return; } assert(!VT.isVector() && "Vectors should have been handled above!"); if ((Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) || (Subtarget.hasFP16() && SrcVT == MVT::f16)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); unsigned NumElts = Subtarget.hasVLX() ? 2 : 8; // If we use a 128-bit result we might need to use a target specific node. unsigned SrcElts = std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits()); MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts); MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts); unsigned Opc = N->getOpcode(); if (NumElts != SrcElts) { if (IsStrict) Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; else Opc = IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; } SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx); SDValue Chain; if (IsStrict) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res); Chain = Res.getValue(1); } else Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx); Results.push_back(Res); if (IsStrict) Results.push_back(Chain); return; } if (VT == MVT::i128 && Subtarget.isTargetWin64()) { SDValue Chain; SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain); Results.push_back(V); if (IsStrict) Results.push_back(Chain); return; } if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) { Results.push_back(V); if (IsStrict) Results.push_back(Chain); } return; } case ISD::LRINT: case ISD::LLRINT: { if (SDValue V = LRINT_LLRINTHelper(N, DAG)) Results.push_back(V); return; } case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::STRICT_UINT_TO_FP: { bool IsStrict = N->isStrictFPOpcode(); bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP || N->getOpcode() == ISD::STRICT_SINT_TO_FP; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(IsStrict ? 1 : 0); if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() && Subtarget.hasVLX()) { if (Src.getValueType().getVectorElementType() == MVT::i16) return; if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32) Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, IsStrict ? DAG.getConstant(0, dl, MVT::v2i32) : DAG.getUNDEF(MVT::v2i32)); if (IsStrict) { unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P; SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other}, {N->getOperand(0), Src}); Results.push_back(Res); Results.push_back(Res.getValue(1)); } else { unsigned Opc = IsSigned ? 
X86ISD::CVTSI2P : X86ISD::CVTUI2P; Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src)); } return; } if (VT != MVT::v2f32) return; EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { if (IsStrict) { unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P; SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other}, {N->getOperand(0), Src}); Results.push_back(Res); Results.push_back(Res.getValue(1)); } else { unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P; Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src)); } return; } if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() && Subtarget.hasSSE41() && !Subtarget.hasAVX512()) { SDValue Zero = DAG.getConstant(0, dl, SrcVT); SDValue One = DAG.getConstant(1, dl, SrcVT); SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT, DAG.getNode(ISD::SRL, dl, SrcVT, Src, One), DAG.getNode(ISD::AND, dl, SrcVT, Src, One)); SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT); SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src); SmallVector SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); for (int i = 0; i != 2; ++i) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, SignSrc, DAG.getIntPtrConstant(i, dl)); if (IsStrict) SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, {N->getOperand(0), Elt}); else SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt); }; SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts); SDValue Slow, Chain; if (IsStrict) { Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SignCvts[0].getValue(1), SignCvts[1].getValue(1)); Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other}, {Chain, SignCvt, SignCvt}); Chain = Slow.getValue(1); } else { Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt); } IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg); IsNeg = DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1}); 
SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt); Results.push_back(Cvt); if (IsStrict) Results.push_back(Chain); return; } if (SrcVT != MVT::v2i32) return; if (IsSigned || Subtarget.hasAVX512()) { if (!IsStrict) return; // Custom widen strict v2i32->v2f32 to avoid scalarization. // FIXME: Should generic type legalizer do this? Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getConstant(0, dl, MVT::v2i32)); SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other}, {N->getOperand(0), Src}); Results.push_back(Res); Results.push_back(Res.getValue(1)); return; } assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP( llvm::bit_cast(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); if (IsStrict) { SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, {N->getOperand(0), Or, VBias}); SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other}, {Sub.getValue(1), Sub}); Results.push_back(Res); Results.push_back(Res.getValue(1)); } else { // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); } return; } case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: { bool IsStrict = N->isStrictFPOpcode(); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); SDValue Src = N->getOperand(IsStrict ? 1 : 0); SDValue Rnd = N->getOperand(IsStrict ? 2 : 1); EVT SrcVT = Src.getValueType(); EVT VT = N->getValueType(0); SDValue V; if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) { SDValue Ext = IsStrict ? 
DAG.getConstantFP(0.0, dl, MVT::v2f32) : DAG.getUNDEF(MVT::v2f32); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext); } if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) { assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C"); if (SrcVT.getVectorElementType() != MVT::f32) return; if (IsStrict) V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, {Chain, Src, Rnd}); else V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd); Results.push_back(DAG.getBitcast(MVT::v8f16, V)); if (IsStrict) Results.push_back(V.getValue(1)); return; } if (!isTypeLegal(Src.getValueType())) return; EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; if (IsStrict) V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other}, {Chain, Src}); else V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src); Results.push_back(V); if (IsStrict) Results.push_back(V.getValue(1)); return; } case ISD::FP_EXTEND: case ISD::STRICT_FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"); if (!Subtarget.hasFP16() || !Subtarget.hasVLX()) return; bool IsStrict = N->isStrictFPOpcode(); SDValue Src = N->getOperand(IsStrict ? 1 : 0); if (Src.getValueType().getVectorElementType() != MVT::f16) return; SDValue Ext = IsStrict ? 
DAG.getConstantFP(0.0, dl, MVT::v2f16) : DAG.getUNDEF(MVT::v2f16); SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext); if (IsStrict) V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other}, {N->getOperand(0), V}); else V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V); Results.push_back(V); if (IsStrict) Results.push_back(V.getValue(1)); return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget, Results); return; case Intrinsic::x86_rdpru: expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget, Results); return; case Intrinsic::x86_xgetbv: expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, Results); return; } } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) && "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; std::tie(cpInL, cpInH) = DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? 
X86::RDX : X86::EDX, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; std::tie(swapInL, swapInH) = DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT); swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); // In 64-bit mode we might need the base pointer in RBX, but we can't know // until later. So we keep the RBX input in a vreg and use a custom // inserter. // Since RBX will be a reserved register the register allocator will not // make sure its value will be properly saved and restored around this // live-range. SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(N)->getMemOperand(); if (Regs64bit) { SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL, swapInH.getValue(1)}; Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO); } else { swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO); } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, Regs64bit ? 
X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(Success); Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_LOAD: { assert( (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) && "Unexpected VT!"); bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { auto *Node = cast(N); if (N->getValueType(0) == MVT::i128) { if (Subtarget.is64Bit() && Subtarget.hasAVX()) { SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(), Node->getBasePtr(), Node->getMemOperand()); SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, DAG.getIntPtrConstant(0, dl)); SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, DAG.getIntPtrConstant(1, dl)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0), {ResL, ResH})); Results.push_back(Ld.getValue(1)); return; } break; } if (Subtarget.hasSSE1()) { // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS. // Then extract the lower 64-bits. MVT LdVT = Subtarget.hasSSE2() ? 
MVT::v2i64 : MVT::v4f32; SDVTList Tys = DAG.getVTList(LdVT, MVT::Other); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); if (Subtarget.hasSSE2()) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Ld.getValue(1)); return; } // We use an alternative sequence for SSE1 that extracts as v2f32 and // then casts to i64. This avoids a 128-bit stack temporary being // created by type legalization if we were to cast v4f32->v2i64. SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld, DAG.getIntPtrConstant(0, dl)); Res = DAG.getBitcast(MVT::i64, Res); Results.push_back(Res); Results.push_back(Ld.getValue(1)); return; } if (Subtarget.hasX87()) { // First load this into an 80-bit X87 register. This will put the whole // integer into the significand. SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); SDValue Chain = Result.getValue(1); // Now store the X87 register to a stack temporary and convert to i64. // This store is not atomic and doesn't need to be. // FIXME: We don't need a stack temporary if the result of the load // is already being stored. We could just directly store there. SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); int SPFI = cast(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); SDValue StoreOps[] = { Chain, Result, StackPtr }; Chain = DAG.getMemIntrinsicNode( X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64, MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore); // Finally load the value back from the stack temporary and return it. // This load is not atomic and doesn't need to be. 
// This load will be further type legalized. Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI); Results.push_back(Result); Results.push_back(Result.getValue(1)); return; } } // TODO: Use MOVLPS when SSE1 is available? // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target // we can split using the k-register rather than memory. if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) { assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); Lo = DAG.getBitcast(MVT::i32, Lo); Hi = DAG.getBitcast(MVT::i32, Hi); SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); Results.push_back(Res); return; } if (DstVT.isVector() && SrcVT == MVT::x86mmx) { // FIXME: Use v4f32 for SSE1? 
assert(Subtarget.hasSSE2() && "Requires SSE2"); assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && "Unexpected type action!"); EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, N->getOperand(0)); Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); return; } return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); if ((VT == MVT::v2f32 || VT == MVT::v2i32) && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Gather->getPassThru(), DAG.getUNDEF(VT)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getMemIntrinsicNode( X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(1)); return; } return; } case ISD::LOAD: { // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp // cast since type legalization will try to use an i64 load. 
MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); if (Subtarget.hasSSE2()) { MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); MVT VecVT = MVT::getVectorVT(LdVT, 2); Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res); EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); Results.push_back(Chain); return; } assert(Subtarget.hasSSE1() && "Expected SSE"); SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, Ld->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(1)); return; } case ISD::ADDRSPACECAST: { SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::BITREVERSE: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); assert(Subtarget.hasXOP() && "Expected XOP"); // We can use VPPERM by copying to a vector register and back. We'll need // to move the scalar in two i32 pieces. 
Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
    return;
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    // f16 = extract vXf16 %vec, i64 %idx
    assert(N->getSimpleValueType(0) == MVT::f16 &&
           "Unexpected Value type of EXTRACT_VECTOR_ELT!");
    assert(Subtarget.hasFP16() && "Expected FP16");
    // Reinterpret the f16 vector as the same-shaped integer vector, extract
    // the element as an i16, then bitcast the scalar back to f16.
    SDValue VecOp = N->getOperand(0);
    EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
    SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
    Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
                        N->getOperand(1));
    Split = DAG.getBitcast(MVT::f16, Split);
    Results.push_back(Split);
    return;
  }
  }
}

/// Return the printable name ("X86ISD::<NODE>") of the X86-specific DAG node
/// identified by \p Opcode.
///
/// Every opcode listed below through NODE_NAME_CASE maps to the string
/// "X86ISD::" followed by the enumerator's spelling. X86ISD::FIRST_NUMBER and
/// any opcode without an entry in the table yield nullptr.
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((X86ISD::NodeType)Opcode) {
  case X86ISD::FIRST_NUMBER:
    // Sentinel value, not a real node: fall out to the nullptr return.
    break;
// Expands to one `case` that returns the stringified enumerator name.
#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
  NODE_NAME_CASE(BSF)
  NODE_NAME_CASE(BSR)
  NODE_NAME_CASE(FSHL)
  NODE_NAME_CASE(FSHR)
  NODE_NAME_CASE(FAND)
  NODE_NAME_CASE(FANDN)
  NODE_NAME_CASE(FOR)
  NODE_NAME_CASE(FXOR)
  NODE_NAME_CASE(FILD)
  NODE_NAME_CASE(FIST)
  NODE_NAME_CASE(FP_TO_INT_IN_MEM)
  NODE_NAME_CASE(FLD)
  NODE_NAME_CASE(FST)
  NODE_NAME_CASE(CALL)
  NODE_NAME_CASE(CALL_RVMARKER)
  NODE_NAME_CASE(BT)
  NODE_NAME_CASE(CMP)
  NODE_NAME_CASE(FCMP)
  NODE_NAME_CASE(STRICT_FCMP)
  NODE_NAME_CASE(STRICT_FCMPS)
  NODE_NAME_CASE(COMI)
  NODE_NAME_CASE(UCOMI)
  NODE_NAME_CASE(CMPM)
  NODE_NAME_CASE(CMPMM)
  NODE_NAME_CASE(STRICT_CMPM)
  NODE_NAME_CASE(CMPMM_SAE)
  NODE_NAME_CASE(SETCC)
  NODE_NAME_CASE(SETCC_CARRY)
  NODE_NAME_CASE(FSETCC)
  NODE_NAME_CASE(FSETCCM)
  NODE_NAME_CASE(FSETCCM_SAE)
  NODE_NAME_CASE(CMOV)
  NODE_NAME_CASE(BRCOND)
  NODE_NAME_CASE(RET_GLUE)
  NODE_NAME_CASE(IRET)
  NODE_NAME_CASE(REP_STOS)
  NODE_NAME_CASE(REP_MOVS)
  NODE_NAME_CASE(GlobalBaseReg)
  NODE_NAME_CASE(Wrapper)
  NODE_NAME_CASE(WrapperRIP)
  NODE_NAME_CASE(MOVQ2DQ)
  NODE_NAME_CASE(MOVDQ2Q)
  NODE_NAME_CASE(MMX_MOVD2W)
  NODE_NAME_CASE(MMX_MOVW2D)
  NODE_NAME_CASE(PEXTRB)
  NODE_NAME_CASE(PEXTRW)
  NODE_NAME_CASE(INSERTPS)
  NODE_NAME_CASE(PINSRB)
  NODE_NAME_CASE(PINSRW)
  NODE_NAME_CASE(PSHUFB)
  NODE_NAME_CASE(ANDNP)
  NODE_NAME_CASE(BLENDI)
  NODE_NAME_CASE(BLENDV)
  NODE_NAME_CASE(HADD)
  NODE_NAME_CASE(HSUB)
  NODE_NAME_CASE(FHADD)
  NODE_NAME_CASE(FHSUB)
  NODE_NAME_CASE(CONFLICT)
  NODE_NAME_CASE(FMAX)
  NODE_NAME_CASE(FMAXS)
  NODE_NAME_CASE(FMAX_SAE)
  NODE_NAME_CASE(FMAXS_SAE)
  NODE_NAME_CASE(FMIN)
  NODE_NAME_CASE(FMINS)
  NODE_NAME_CASE(FMIN_SAE)
  NODE_NAME_CASE(FMINS_SAE)
  NODE_NAME_CASE(FMAXC)
  NODE_NAME_CASE(FMINC)
  NODE_NAME_CASE(FRSQRT)
  NODE_NAME_CASE(FRCP)
  NODE_NAME_CASE(EXTRQI)
  NODE_NAME_CASE(INSERTQI)
  NODE_NAME_CASE(TLSADDR)
  NODE_NAME_CASE(TLSBASEADDR)
  NODE_NAME_CASE(TLSCALL)
  NODE_NAME_CASE(TLSDESC)
  NODE_NAME_CASE(EH_SJLJ_SETJMP)
  NODE_NAME_CASE(EH_SJLJ_LONGJMP)
  NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
  NODE_NAME_CASE(EH_RETURN)
  NODE_NAME_CASE(TC_RETURN)
  NODE_NAME_CASE(FNSTCW16m)
  NODE_NAME_CASE(FLDCW16m)
  NODE_NAME_CASE(FNSTENVm)
  NODE_NAME_CASE(FLDENVm)
  NODE_NAME_CASE(LCMPXCHG_DAG)
  NODE_NAME_CASE(LCMPXCHG8_DAG)
  NODE_NAME_CASE(LCMPXCHG16_DAG)
  NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
  NODE_NAME_CASE(LADD)
  NODE_NAME_CASE(LSUB)
  NODE_NAME_CASE(LOR)
  NODE_NAME_CASE(LXOR)
  NODE_NAME_CASE(LAND)
  NODE_NAME_CASE(LBTS)
  NODE_NAME_CASE(LBTC)
  NODE_NAME_CASE(LBTR)
  NODE_NAME_CASE(LBTS_RM)
  NODE_NAME_CASE(LBTC_RM)
  NODE_NAME_CASE(LBTR_RM)
  NODE_NAME_CASE(AADD)
  NODE_NAME_CASE(AOR)
  NODE_NAME_CASE(AXOR)
  NODE_NAME_CASE(AAND)
  NODE_NAME_CASE(VZEXT_MOVL)
  NODE_NAME_CASE(VZEXT_LOAD)
  NODE_NAME_CASE(VEXTRACT_STORE)
  NODE_NAME_CASE(VTRUNC)
  NODE_NAME_CASE(VTRUNCS)
  NODE_NAME_CASE(VTRUNCUS)
  NODE_NAME_CASE(VMTRUNC)
  NODE_NAME_CASE(VMTRUNCS)
  NODE_NAME_CASE(VMTRUNCUS)
  NODE_NAME_CASE(VTRUNCSTORES)
  NODE_NAME_CASE(VTRUNCSTOREUS)
  NODE_NAME_CASE(VMTRUNCSTORES)
  NODE_NAME_CASE(VMTRUNCSTOREUS)
  NODE_NAME_CASE(VFPEXT)
  NODE_NAME_CASE(STRICT_VFPEXT)
  NODE_NAME_CASE(VFPEXT_SAE)
  NODE_NAME_CASE(VFPEXTS)
  NODE_NAME_CASE(VFPEXTS_SAE)
  NODE_NAME_CASE(VFPROUND)
  NODE_NAME_CASE(STRICT_VFPROUND)
  NODE_NAME_CASE(VMFPROUND)
  NODE_NAME_CASE(VFPROUND_RND)
  NODE_NAME_CASE(VFPROUNDS)
  NODE_NAME_CASE(VFPROUNDS_RND)
  NODE_NAME_CASE(VSHLDQ)
  NODE_NAME_CASE(VSRLDQ)
  NODE_NAME_CASE(VSHL)
  NODE_NAME_CASE(VSRL)
  NODE_NAME_CASE(VSRA)
  NODE_NAME_CASE(VSHLI)
  NODE_NAME_CASE(VSRLI)
  NODE_NAME_CASE(VSRAI)
  NODE_NAME_CASE(VSHLV)
  NODE_NAME_CASE(VSRLV)
  NODE_NAME_CASE(VSRAV)
  NODE_NAME_CASE(VROTLI)
  NODE_NAME_CASE(VROTRI)
  NODE_NAME_CASE(VPPERM)
  NODE_NAME_CASE(CMPP)
  NODE_NAME_CASE(STRICT_CMPP)
  NODE_NAME_CASE(PCMPEQ)
  NODE_NAME_CASE(PCMPGT)
  NODE_NAME_CASE(PHMINPOS)
  NODE_NAME_CASE(ADD)
  NODE_NAME_CASE(SUB)
  NODE_NAME_CASE(ADC)
  NODE_NAME_CASE(SBB)
  NODE_NAME_CASE(SMUL)
  NODE_NAME_CASE(UMUL)
  NODE_NAME_CASE(OR)
  NODE_NAME_CASE(XOR)
  NODE_NAME_CASE(AND)
  NODE_NAME_CASE(BEXTR)
  NODE_NAME_CASE(BEXTRI)
  NODE_NAME_CASE(BZHI)
  NODE_NAME_CASE(PDEP)
  NODE_NAME_CASE(PEXT)
  NODE_NAME_CASE(MUL_IMM)
  NODE_NAME_CASE(MOVMSK)
  NODE_NAME_CASE(PTEST)
  NODE_NAME_CASE(TESTP)
  NODE_NAME_CASE(KORTEST)
  NODE_NAME_CASE(KTEST)
  NODE_NAME_CASE(KADD)
  NODE_NAME_CASE(KSHIFTL)
  NODE_NAME_CASE(KSHIFTR)
  NODE_NAME_CASE(PACKSS)
  NODE_NAME_CASE(PACKUS)
  NODE_NAME_CASE(PALIGNR)
  NODE_NAME_CASE(VALIGN)
  NODE_NAME_CASE(VSHLD)
  NODE_NAME_CASE(VSHRD)
  NODE_NAME_CASE(VSHLDV)
  NODE_NAME_CASE(VSHRDV)
  NODE_NAME_CASE(PSHUFD)
  NODE_NAME_CASE(PSHUFHW)
  NODE_NAME_CASE(PSHUFLW)
  NODE_NAME_CASE(SHUFP)
  NODE_NAME_CASE(SHUF128)
  NODE_NAME_CASE(MOVLHPS)
  NODE_NAME_CASE(MOVHLPS)
  NODE_NAME_CASE(MOVDDUP)
  NODE_NAME_CASE(MOVSHDUP)
  NODE_NAME_CASE(MOVSLDUP)
  NODE_NAME_CASE(MOVSD)
  NODE_NAME_CASE(MOVSS)
  NODE_NAME_CASE(MOVSH)
  NODE_NAME_CASE(UNPCKL)
  NODE_NAME_CASE(UNPCKH)
  NODE_NAME_CASE(VBROADCAST)
  NODE_NAME_CASE(VBROADCAST_LOAD)
  NODE_NAME_CASE(VBROADCASTM)
  NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
  NODE_NAME_CASE(VPERMILPV)
  NODE_NAME_CASE(VPERMILPI)
  NODE_NAME_CASE(VPERM2X128)
  NODE_NAME_CASE(VPERMV)
  NODE_NAME_CASE(VPERMV3)
  NODE_NAME_CASE(VPERMI)
  NODE_NAME_CASE(VPTERNLOG)
  NODE_NAME_CASE(VFIXUPIMM)
  NODE_NAME_CASE(VFIXUPIMM_SAE)
  NODE_NAME_CASE(VFIXUPIMMS)
  NODE_NAME_CASE(VFIXUPIMMS_SAE)
  NODE_NAME_CASE(VRANGE)
  NODE_NAME_CASE(VRANGE_SAE)
  NODE_NAME_CASE(VRANGES)
  NODE_NAME_CASE(VRANGES_SAE)
  NODE_NAME_CASE(PMULUDQ)
  NODE_NAME_CASE(PMULDQ)
  NODE_NAME_CASE(PSADBW)
  NODE_NAME_CASE(DBPSADBW)
  NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
  NODE_NAME_CASE(VAARG_64)
  NODE_NAME_CASE(VAARG_X32)
  NODE_NAME_CASE(DYN_ALLOCA)
  NODE_NAME_CASE(MFENCE)
  NODE_NAME_CASE(SEG_ALLOCA)
  NODE_NAME_CASE(PROBED_ALLOCA)
  NODE_NAME_CASE(RDRAND)
  NODE_NAME_CASE(RDSEED)
  NODE_NAME_CASE(RDPKRU)
  NODE_NAME_CASE(WRPKRU)
  NODE_NAME_CASE(VPMADDUBSW)
  NODE_NAME_CASE(VPMADDWD)
  NODE_NAME_CASE(VPSHA)
  NODE_NAME_CASE(VPSHL)
  NODE_NAME_CASE(VPCOM)
  NODE_NAME_CASE(VPCOMU)
  NODE_NAME_CASE(VPERMIL2)
  NODE_NAME_CASE(FMSUB)
  NODE_NAME_CASE(STRICT_FMSUB)
  NODE_NAME_CASE(FNMADD)
  NODE_NAME_CASE(STRICT_FNMADD)
  NODE_NAME_CASE(FNMSUB)
  NODE_NAME_CASE(STRICT_FNMSUB)
  NODE_NAME_CASE(FMADDSUB)
  NODE_NAME_CASE(FMSUBADD)
  NODE_NAME_CASE(FMADD_RND)
  NODE_NAME_CASE(FNMADD_RND)
  NODE_NAME_CASE(FMSUB_RND)
  NODE_NAME_CASE(FNMSUB_RND)
  NODE_NAME_CASE(FMADDSUB_RND)
  NODE_NAME_CASE(FMSUBADD_RND)
  NODE_NAME_CASE(VFMADDC)
  NODE_NAME_CASE(VFMADDC_RND)
  NODE_NAME_CASE(VFCMADDC)
  NODE_NAME_CASE(VFCMADDC_RND)
  NODE_NAME_CASE(VFMULC)
  NODE_NAME_CASE(VFMULC_RND)
  NODE_NAME_CASE(VFCMULC)
  NODE_NAME_CASE(VFCMULC_RND)
  NODE_NAME_CASE(VFMULCSH)
  NODE_NAME_CASE(VFMULCSH_RND)
  NODE_NAME_CASE(VFCMULCSH)
  NODE_NAME_CASE(VFCMULCSH_RND)
  NODE_NAME_CASE(VFMADDCSH)
  NODE_NAME_CASE(VFMADDCSH_RND)
  NODE_NAME_CASE(VFCMADDCSH)
  NODE_NAME_CASE(VFCMADDCSH_RND)
  NODE_NAME_CASE(VPMADD52H)
  NODE_NAME_CASE(VPMADD52L)
  NODE_NAME_CASE(VRNDSCALE)
  NODE_NAME_CASE(STRICT_VRNDSCALE)
  NODE_NAME_CASE(VRNDSCALE_SAE)
  NODE_NAME_CASE(VRNDSCALES)
  NODE_NAME_CASE(VRNDSCALES_SAE)
  NODE_NAME_CASE(VREDUCE)
  NODE_NAME_CASE(VREDUCE_SAE)
  NODE_NAME_CASE(VREDUCES)
  NODE_NAME_CASE(VREDUCES_SAE)
  NODE_NAME_CASE(VGETMANT)
  NODE_NAME_CASE(VGETMANT_SAE)
  NODE_NAME_CASE(VGETMANTS)
  NODE_NAME_CASE(VGETMANTS_SAE)
  NODE_NAME_CASE(PCMPESTR)
  NODE_NAME_CASE(PCMPISTR)
  NODE_NAME_CASE(XTEST)
  NODE_NAME_CASE(COMPRESS)
  NODE_NAME_CASE(EXPAND)
  NODE_NAME_CASE(SELECTS)
  NODE_NAME_CASE(ADDSUB)
  NODE_NAME_CASE(RCP14)
  NODE_NAME_CASE(RCP14S)
  NODE_NAME_CASE(RSQRT14)
  NODE_NAME_CASE(RSQRT14S)
  NODE_NAME_CASE(FADD_RND)
  NODE_NAME_CASE(FADDS)
  NODE_NAME_CASE(FADDS_RND)
  NODE_NAME_CASE(FSUB_RND)
  NODE_NAME_CASE(FSUBS)
  NODE_NAME_CASE(FSUBS_RND)
  NODE_NAME_CASE(FMUL_RND)
  NODE_NAME_CASE(FMULS)
  NODE_NAME_CASE(FMULS_RND)
  NODE_NAME_CASE(FDIV_RND)
  NODE_NAME_CASE(FDIVS)
  NODE_NAME_CASE(FDIVS_RND)
  NODE_NAME_CASE(FSQRT_RND)
  NODE_NAME_CASE(FSQRTS)
  NODE_NAME_CASE(FSQRTS_RND)
  NODE_NAME_CASE(FGETEXP)
  NODE_NAME_CASE(FGETEXP_SAE)
  NODE_NAME_CASE(FGETEXPS)
  NODE_NAME_CASE(FGETEXPS_SAE)
  NODE_NAME_CASE(SCALEF)
  NODE_NAME_CASE(SCALEF_RND)
  NODE_NAME_CASE(SCALEFS)
  NODE_NAME_CASE(SCALEFS_RND)
  NODE_NAME_CASE(MULHRS)
  NODE_NAME_CASE(SINT_TO_FP_RND)
  NODE_NAME_CASE(UINT_TO_FP_RND)
  NODE_NAME_CASE(CVTTP2SI)
  NODE_NAME_CASE(CVTTP2UI)
  NODE_NAME_CASE(STRICT_CVTTP2SI)
  NODE_NAME_CASE(STRICT_CVTTP2UI)
  NODE_NAME_CASE(MCVTTP2SI)
  NODE_NAME_CASE(MCVTTP2UI)
  NODE_NAME_CASE(CVTTP2SI_SAE)
  NODE_NAME_CASE(CVTTP2UI_SAE)
  NODE_NAME_CASE(CVTTS2SI)
  NODE_NAME_CASE(CVTTS2UI)
  NODE_NAME_CASE(CVTTS2SI_SAE)
  NODE_NAME_CASE(CVTTS2UI_SAE)
  NODE_NAME_CASE(CVTSI2P)
  NODE_NAME_CASE(CVTUI2P)
  NODE_NAME_CASE(STRICT_CVTSI2P)
  NODE_NAME_CASE(STRICT_CVTUI2P)
  NODE_NAME_CASE(MCVTSI2P)
  NODE_NAME_CASE(MCVTUI2P)
  NODE_NAME_CASE(VFPCLASS)
  NODE_NAME_CASE(VFPCLASSS)
  NODE_NAME_CASE(MULTISHIFT)
  NODE_NAME_CASE(SCALAR_SINT_TO_FP)
  NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
  NODE_NAME_CASE(SCALAR_UINT_TO_FP)
  NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
  NODE_NAME_CASE(CVTPS2PH)
  NODE_NAME_CASE(STRICT_CVTPS2PH)
  NODE_NAME_CASE(CVTPS2PH_SAE)
  NODE_NAME_CASE(MCVTPS2PH)
  NODE_NAME_CASE(MCVTPS2PH_SAE)
  NODE_NAME_CASE(CVTPH2PS)
  NODE_NAME_CASE(STRICT_CVTPH2PS)
  NODE_NAME_CASE(CVTPH2PS_SAE)
  NODE_NAME_CASE(CVTP2SI)
  NODE_NAME_CASE(CVTP2UI)
  NODE_NAME_CASE(MCVTP2SI)
  NODE_NAME_CASE(MCVTP2UI)
  NODE_NAME_CASE(CVTP2SI_RND)
  NODE_NAME_CASE(CVTP2UI_RND)
  NODE_NAME_CASE(CVTS2SI)
  NODE_NAME_CASE(CVTS2UI)
  NODE_NAME_CASE(CVTS2SI_RND)
  NODE_NAME_CASE(CVTS2UI_RND)
  NODE_NAME_CASE(CVTNE2PS2BF16)
  NODE_NAME_CASE(CVTNEPS2BF16)
  NODE_NAME_CASE(MCVTNEPS2BF16)
  NODE_NAME_CASE(DPBF16PS)
  NODE_NAME_CASE(LWPINS)
  NODE_NAME_CASE(MGATHER)
  NODE_NAME_CASE(MSCATTER)
  NODE_NAME_CASE(VPDPBUSD)
  NODE_NAME_CASE(VPDPBUSDS)
  NODE_NAME_CASE(VPDPWSSD)
  NODE_NAME_CASE(VPDPWSSDS)
  NODE_NAME_CASE(VPSHUFBITQMB)
  NODE_NAME_CASE(GF2P8MULB)
  NODE_NAME_CASE(GF2P8AFFINEQB)
  NODE_NAME_CASE(GF2P8AFFINEINVQB)
  NODE_NAME_CASE(NT_CALL)
  NODE_NAME_CASE(NT_BRIND)
  NODE_NAME_CASE(UMWAIT)
  NODE_NAME_CASE(TPAUSE)
  NODE_NAME_CASE(ENQCMD)
  NODE_NAME_CASE(ENQCMDS)
  NODE_NAME_CASE(VP2INTERSECT)
  NODE_NAME_CASE(VPDPBSUD)
  NODE_NAME_CASE(VPDPBSUDS)
  NODE_NAME_CASE(VPDPBUUD)
  NODE_NAME_CASE(VPDPBUUDS)
  NODE_NAME_CASE(VPDPBSSD)
  NODE_NAME_CASE(VPDPBSSDS)
  NODE_NAME_CASE(AESENC128KL)
  NODE_NAME_CASE(AESDEC128KL)
  NODE_NAME_CASE(AESENC256KL)
  NODE_NAME_CASE(AESDEC256KL)
  NODE_NAME_CASE(AESENCWIDE128KL)
  NODE_NAME_CASE(AESDECWIDE128KL)
  NODE_NAME_CASE(AESENCWIDE256KL)
  NODE_NAME_CASE(AESDECWIDE256KL)
  NODE_NAME_CASE(CMPCCXADD)
  NODE_NAME_CASE(TESTUI)
  NODE_NAME_CASE(FP80_ADD)
  NODE_NAME_CASE(STRICT_FP80_ADD)
  NODE_NAME_CASE(CCMP)
  NODE_NAME_CASE(CTEST)
  NODE_NAME_CASE(CLOAD)
  NODE_NAME_CASE(CSTORE)
  }
  // Reached for FIRST_NUMBER and for any opcode not listed above.
  return nullptr;
#undef NODE_NAME_CASE
}

/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS,
                                              Instruction *I) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M,
                                         AM.BaseGV != nullptr))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) return false; // If lower 4G is not available, then we must use rip-relative addressing. if ((M != CodeModel::Small || isPositionIndependent()) && Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } switch (AM.Scale) { case 0: case 1: case 2: case 4: case 8: // These scales always work. break; case 3: case 5: case 9: // These scales are formed with basereg+scalereg. Only accept if there is // no basereg yet. if (AM.HasBaseReg) return false; break; default: // Other stuff never works. return false; } return true; } bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. if (Subtarget.hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) return false; // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable // shifts just as cheap as scalar ones. if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) return false; // AVX512BW has shifts such as vpsllvw. if (Subtarget.hasBWI() && Bits == 16) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. return true; } bool X86TargetLowering::isBinOp(unsigned Opcode) const { switch (Opcode) { // These are non-commutative binops. // TODO: Add more X86ISD opcodes once we have test coverage. case X86ISD::ANDNP: case X86ISD::PCMPGT: case X86ISD::FMAX: case X86ISD::FMIN: case X86ISD::FANDN: case X86ISD::VPSHA: case X86ISD::VPSHL: case X86ISD::VSHLV: case X86ISD::VSRLV: case X86ISD::VSRAV: return true; } return TargetLoweringBase::isBinOp(Opcode); } bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { switch (Opcode) { // TODO: Add more X86ISD opcodes once we have test coverage. 
case X86ISD::PCMPEQ: case X86ISD::PMULDQ: case X86ISD::PMULUDQ: case X86ISD::FMAXC: case X86ISD::FMINC: case X86ISD::FAND: case X86ISD::FOR: case X86ISD::FXOR: return true; } return TargetLoweringBase::isCommutativeBinOp(Opcode); } bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. return isInt<32>(Imm); } bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isScalarInteger() || !VT2.isScalarInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) return true; if (Val.getOpcode() != ISD::LOAD) return false; if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i8: case MVT::i16: case MVT::i32: // X86 has 8, 16, and 32-bit zero-extending loads. return true; } return false; } bool X86TargetLowering::shouldSinkOperands(Instruction *I, SmallVectorImpl &Ops) const { using namespace llvm::PatternMatch; FixedVectorType *VTy = dyn_cast(I->getType()); if (!VTy) return false; if (I->getOpcode() == Instruction::Mul && VTy->getElementType()->isIntegerTy(64)) { for (auto &Op : I->operands()) { // Make sure we are not already sinking this operand if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) continue; // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or // the PMULUDQ pattern where the input is a zext_inreg from vXi32. if (Subtarget.hasSSE41() && match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)), m_SpecificInt(32)))) { Ops.push_back(&cast(Op)->getOperandUse(0)); Ops.push_back(&Op); } else if (Subtarget.hasSSE2() && match(Op.get(), m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { Ops.push_back(&Op); } } return !Ops.empty(); } // A uniform shift amount in a vector shift or funnel shift may be much // cheaper than a generic variable vector shift, so make that pattern visible // to SDAG by sinking the shuffle instruction next to the shift. 
int ShiftAmountOpNum = -1; if (I->isShift()) ShiftAmountOpNum = 1; else if (auto *II = dyn_cast(I)) { if (II->getIntrinsicID() == Intrinsic::fshl || II->getIntrinsicID() == Intrinsic::fshr) ShiftAmountOpNum = 2; } if (ShiftAmountOpNum == -1) return false; auto *Shuf = dyn_cast(I->getOperand(ShiftAmountOpNum)); if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 && isVectorShiftByScalarCheap(I->getType())) { Ops.push_back(&I->getOperandUse(ShiftAmountOpNum)); return true; } return false; } bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const { if (!Subtarget.is64Bit()) return false; return TargetLowering::shouldConvertPhiType(From, To); } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { if (isa(ExtVal.getOperand(0))) return false; EVT SrcVT = ExtVal.getOperand(0).getValueType(); // There is no extending load for vXi1. if (SrcVT.getScalarType() == MVT::i1) return false; return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { if (!Subtarget.hasAnyFMA()) return false; VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f16: return Subtarget.hasFP16(); case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(SrcVT == MVT::i32 && DestVT == MVT::i16); } bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode, EVT VT) const { // TODO: This is too general. There are cases where pre-AVX512 codegen would // benefit. The transform may also be profitable for scalar code. 
if (!Subtarget.hasAVX512()) return false; if (!Subtarget.hasVLX() && !VT.is512BitVector()) return false; if (!VT.isVector() || VT.getScalarType() == MVT::i1) return false; return true; } /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool X86TargetLowering::isShuffleMaskLegal(ArrayRef Mask, EVT VT) const { if (!VT.isSimple()) return false; // Not for i1 vectors if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can // handle any possible shuffle mask that results. return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef Mask, EVT VT) const { // Don't convert an 'and' into a shuffle that we don't directly support. // vpblendw and vpshufb for 256-bit vectors are not available on AVX1. if (!Subtarget.hasAVX2()) if (VT == MVT::v32i8 || VT == MVT::v16i16) return false; // Just delegate to the generic legality, clear masks aren't special. return isShuffleMaskLegal(Mask, VT); } bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { // If the subtarget is using thunks, we need to not generate jump tables. if (Subtarget.useIndirectThunkBranches()) return false; // Otherwise, fallback on the generic logic. return TargetLowering::areJTsAllowed(Fn); } MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const { // Avoid 8 and 16 bit types because they increase the chance for unnecessary // zero-extensions. 
if (ConditionVT.getSizeInBits() < 32) return MVT::i32; return TargetLoweringBase::getPreferredSwitchConditionType(Context, ConditionVT); } //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// // Returns true if EFLAG is consumed after this iterator in the rest of the // basic block or any successors of the basic block. static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB) { // Scan forward through BB for a use/def of EFLAGS. for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) { if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr)) return true; // If we found a def, we can stop searching. if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) return false; } // If we hit the end of the block, check whether EFLAGS is live into a // successor. for (MachineBasicBlock *Succ : BB->successors()) if (Succ->isLiveIn(X86::EFLAGS)) return true; return false; } /// Utility function to emit xbegin specifying the start of an RTM region. 
///
/// Expands the pseudo instruction \p MI (whose operand 0 is the destination
/// register) located in \p MBB into a small diamond of new blocks, erases
/// \p MI, and returns the sink block that now holds the instructions that
/// followed \p MI together with \p MBB's former successor edges.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  const MIMetadata MIMD(MI);

  const BasicBlock *BB = MBB->getBasicBlock();
  // Insertion point for the new blocks: right after MBB in function order.
  MachineFunction::iterator I = ++MBB->getIterator();

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin fallMBB
  //
  // mainMBB:
  //  s0 = -1
  //
  // fallBB:
  //  eax = # XABORT_DEF
  //  s1 = eax
  //
  // sinkMBB:
  //  v = phi(s0/mainBB, s1/fallBB)

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, fallMBB);
  MF->insert(I, sinkMBB);

  // If EFLAGS is still needed after MI, it has to be live through every new
  // block on the way to the code that was moved into sinkMBB.
  if (isEFLAGSLiveAfter(MI, MBB)) {
    mainMBB->addLiveIn(X86::EFLAGS);
    fallMBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // One fresh virtual register per incoming PHI value, in DstReg's class.
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  Register mainDstReg = MRI.createVirtualRegister(RC);
  Register fallDstReg = MRI.createVirtualRegister(RC);

  // thisMBB:
  //  xbegin fallMBB
  //  # fallthrough to mainMBB
  //  # abortion to fallMBB
  BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(fallMBB);

  // mainMBB:
  //  mainDstReg := -1
  BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
  BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  mainMBB->addSuccessor(sinkMBB);

  // fallMBB:
  //  ; pseudo instruction to model hardware's definition from XABORT
  //  EAX := XABORT_DEF
  //  fallDstReg := EAX
  BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
  BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
      .addReg(X86::EAX);
  fallMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
  BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
      .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(fallDstReg).addMBB(fallMBB);

  MI.eraseFromParent();
  return sinkMBB;
}

/// Expand the VAARG pseudo instruction \p MI in \p MBB into real
/// instructions that fetch the next variadic argument address from the
/// va_list and advance the va_list. The pseudo's operand layout is listed at
/// the top of the body.
MachineBasicBlock *
X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");

  // Operands 1-5 together form the memory address of the va_list.
  Register DestReg = MI.getOperand(0).getReg();
  MachineOperand &Base = MI.getOperand(1);
  MachineOperand &Scale = MI.getOperand(2);
  MachineOperand &Index = MI.getOperand(3);
  MachineOperand &Disp = MI.getOperand(4);
  MachineOperand &Segment = MI.getOperand(5);
  unsigned ArgSize = MI.getOperand(6).getImm();
  unsigned ArgMode = MI.getOperand(7).getImm();
  Align Alignment = Align(MI.getOperand(8).getImm());

  MachineFunction *MF = MBB->getParent();

  // Memory Reference
  assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");

  MachineMemOperand *OldMMO = MI.memoperands().front();

  // Clone the MMO into two separate MMOs for loading and storing
  MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
      OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
  MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
      OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);

  // Machine Information
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass =
getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  const MIMetadata MIMD(MI);

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  // Upper bound of the offset into reg_save_area: the 6 * 8 bytes of integer
  // registers, plus the 8 * 16 bytes of XMM registers when using fp_offset.
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Alignment > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;        // Loaded gp_offset/fp_offset (branching only)

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //         |     .
    //         |        .
    //     offsetMBB   overflowMBB
    //         |        .
    //         |     .
    //        endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    // (gp_offset lives at displacement 0 of the va_list, fp_offset at 4.)
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(LoadOnlyMMO);

    // Check if there is enough room left to pull this argument.
    // NOTE(review): the bound appears to encode
    // "offset + ArgSizeA8 <= MaxOffset", assuming offsets are always
    // multiples of 8 - confirm against the ABI.
    BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
      .addMBB(overflowMBB).addImm(X86::COND_AE);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    // Displacement 16 with 64-bit pointers (LP64), 12 otherwise (presumably
    // the 4-byte-pointer layout - confirm).
    Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(
        offsetMBB, MIMD,
        TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
        RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
        .add(Segment)
        .setMemRefs(LoadOnlyMMO);

    if (Subtarget.isTarget64BitLP64()) {
      // Zero-extend the offset
      Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
      BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
          .addImm(0)
          .addReg(OffsetReg)
          .addImm(X86::sub_32bit);

      // Add the offset to the reg_save_area to get the final address.
      BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
          .addReg(OffsetReg64)
          .addReg(RegSaveReg);
    } else {
      // Add the offset to the reg_save_area to get the final address.
      BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
          .addReg(OffsetReg)
          .addReg(RegSaveReg);
    }

    // Compute the offset for the next argument
    // (advance by 16 when consuming via fp_offset, by 8 via gp_offset).
    Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(StoreOnlyMMO);

    // Jump to endMBB
    BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  // (overflow_area lives at displacement 8 of the va_list.)
  Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, MIMD,
          TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
          OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(LoadOnlyMMO);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    Register TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(
        overflowMBB, MIMD,
        TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
        TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Alignment.value() - 1);

    BuildMI(
        overflowMBB, MIMD,
        TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
        OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Alignment.value() - 1));
  } else {
    BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(
      overflowMBB, MIMD,
      TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
      NextAddrReg)
      .addReg(OverflowDestReg)
      .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, MIMD,
          TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(StoreOnlyMMO);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), MIMD,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  // endMBB is thisMBB itself when no branch was created.
  return endMBB;
}

// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // EFLAGS is still read later on: no kill flag belongs on SelectItr.
  if (isEFLAGSLiveAfter(SelectItr, BB))
    return false;

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
SelectItr->addRegisterKilled(X86::EFLAGS, TRI); return true; } // Return true if it is OK for this CMOV pseudo-opcode to be cascaded // together with other CMOV pseudo-opcodes into a single basic-block with // conditional jump around it. static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CMOV_FR16: case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_VR64: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: case X86::CMOV_VK1: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: case X86::CMOV_VK16: case X86::CMOV_VK32: case X86::CMOV_VK64: return true; default: return false; } } // Helper function, which inserts PHI functions into SinkMBB: // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for // the last PHI function inserted. static MachineInstrBuilder createPHIsForCMOVsInSinkBB( MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB) { MachineFunction *MF = TrueMBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); const MIMetadata MIMD(*MIItBegin); X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); // As we are creating the PHIs, we have to be careful if there is more than // one. Later CMOVs may reference the results of earlier CMOVs, but later // PHIs have to reference the individual true/false inputs from earlier PHIs. 
// That also means that PHI construction must work forward from earlier to // later, and that the code must maintain a mapping from earlier PHI's // destination registers, and the registers that went into the PHI. DenseMap> RegRewriteTable; MachineInstrBuilder MIB; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { Register DestReg = MIIt->getOperand(0).getReg(); Register Op1Reg = MIIt->getOperand(1).getReg(); Register Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the // PHI that is going to be generated. if (MIIt->getOperand(3).getImm() == OppCC) std::swap(Op1Reg, Op2Reg); if (RegRewriteTable.contains(Op1Reg)) Op1Reg = RegRewriteTable[Op1Reg].first; if (RegRewriteTable.contains(Op2Reg)) Op2Reg = RegRewriteTable[Op2Reg].second; MIB = BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(FalseMBB) .addReg(Op2Reg) .addMBB(TrueMBB); // Add this PHI to the rewrite table. RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); } return MIB; } // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2). MachineBasicBlock * X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, MachineInstr &SecondCascadedCMOV, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const MIMetadata MIMD(FirstCMOV); // We lower cascaded CMOVs such as // // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2) // // to two successive branches. // // Without this, we would add a PHI between the two jumps, which ends up // creating a few copies all around. 
// For instance, for
  //
  //    (sitofp (zext (fcmp une)))
  //
  // we would generate:
  //
  //         ucomiss %xmm1, %xmm0
  //         movss  <1.0f>, %xmm0
  //         movaps  %xmm0, %xmm1
  //         jne     .LBB5_2
  //         xorps   %xmm1, %xmm1
  // .LBB5_2:
  //         jp      .LBB5_4
  //         movaps  %xmm1, %xmm0
  // .LBB5_4:
  //         retq
  //
  // because this custom-inserter would have generated:
  //
  //   A
  //   | \
  //   |  B
  //   | /
  //   C
  //   | \
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // B: empty
  // C: Z = PHI [X, A], [Y, B]
  // D: empty
  // E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate:
  //
  //   A
  //   | \
  //   |  C
  //   | /|
  //   |/ |
  //   |  |
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // D: empty
  // E: PHI [X, A], [X, C], [Y, D]
  //
  // Which, in our sitofp/fcmp example, gives us something like:
  //
  //         ucomiss %xmm1, %xmm0
  //         movss  <1.0f>, %xmm0
  //         jne     .LBB5_4
  //         jp      .LBB5_4
  //         xorps   %xmm0, %xmm0
  // .LBB5_4:
  //         retq
  //

  // We lower cascaded CMOV into two successive branches to the same block.
  // EFLAGS is used by both, so mark it as live in the second.
  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
  MachineFunction *F = ThisMBB->getParent();
  MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

  // Place the three new blocks right after ThisMBB, in the order
  // FirstInsertedMBB, SecondInsertedMBB, SinkMBB.
  MachineFunction::iterator It = ++ThisMBB->getIterator();
  F->insert(It, FirstInsertedMBB);
  F->insert(It, SecondInsertedMBB);
  F->insert(It, SinkMBB);

  // For a cascaded CMOV, we lower it to two successive branches to
  // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
  // the FirstInsertedMBB.
  FirstInsertedMBB->addLiveIn(X86::EFLAGS);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
      !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
    SecondInsertedMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
  // Everything after FirstCMOV moves, which includes SecondCascadedCMOV; it is
  // erased from its new parent at the end of this function.
  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
                  std::next(MachineBasicBlock::iterator(FirstCMOV)),
                  ThisMBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

  // Fallthrough block for ThisMBB.
  ThisMBB->addSuccessor(FirstInsertedMBB);
  // The true block target of the first branch is always SinkMBB.
  ThisMBB->addSuccessor(SinkMBB);
  // Fallthrough block for FirstInsertedMBB.
  FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
  // The true block for the branch of FirstInsertedMBB.
  FirstInsertedMBB->addSuccessor(SinkMBB);
  // This is fallthrough.
  SecondInsertedMBB->addSuccessor(SinkMBB);

  // Create the conditional branch instructions. Each branch uses the
  // condition code of the corresponding CMOV and targets SinkMBB.
  X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
  BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);

  X86::CondCode SecondCC =
      X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
  BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
      .addMBB(SinkMBB)
      .addImm(SecondCC);

  //  SinkMBB:
  //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ],
  //                 [ %TrueValue, FirstInsertedMBB ]
  // The result register is the one defined by SecondCascadedCMOV; both input
  // values are read from FirstCMOV.
  Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
  Register Op1Reg = FirstCMOV.getOperand(1).getReg();
  Register Op2Reg = FirstCMOV.getOperand(2).getReg();
  MachineInstrBuilder MIB =
      BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
          .addReg(Op1Reg)
          .addMBB(SecondInsertedMBB)
          .addReg(Op2Reg)
          .addMBB(ThisMBB);

  // The edge from FirstInsertedMBB provides the same incoming value as the
  // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
  MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);

  // Now remove the CMOVs.
  FirstCMOV.eraseFromParent();
  SecondCascadedCMOV.eraseFromParent();

  return SinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
                                     MachineBasicBlock *ThisMBB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const MIMetadata MIMD(MI);

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between and a branch opcode to use.

  //  ThisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> FalseMBB

  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
  // as described above, by inserting a BB, and then making a PHI at the join
  // point to select the true and false operands of the CMOV in the PHI.
  //
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  // In this case, there are multiple CMOVs in a row, all of which are based on
  // the same condition setting (or the exact opposite condition setting).
  // In this case we can lower all the CMOVs using a single inserted BB, and
  // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here is that in a case like:
  //
  // t2 = CMOV cond1 t1, f1
  // t3 = CMOV cond1 t2, f2
  //
  // when rewriting this into PHIs, we have to perform some renaming on the
  // temps since you cannot have a PHI operand refer to a PHI result earlier
  // in the same block.  The "simple" but wrong lowering would be:
  //
  // t2 = PHI t1(BB1), f1(BB2)
  // t3 = PHI t2(BB1), f2(BB2)
  //
  // but clearly t2 is not defined in BB1, so that is incorrect.
The proper // renaming is to note that on the path through BB1, t2 is really just a // copy of t1, and do that renaming, properly generating: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t1(BB1), f2(BB2) // // Case 2: // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate // function - EmitLoweredCascadedSelect. X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. Skip over // intervening debug insts. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; NextMIIt = next_nodbg(NextMIIt, ThisMBB->end()); } } // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. if (LastCMOV == &MI && NextMIIt != ThisMBB->end() && NextMIIt->getOpcode() == MI.getOpcode() && NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && NextMIIt->getOperand(1).isKill()) { return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB); } const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FalseMBB); F->insert(It, SinkMBB); // Set the call frame size on entry to the new basic blocks. 
unsigned CallFrameSize = TII->getCallFrameSizeAt(MI); FalseMBB->setCallFrameSize(CallFrameSize); SinkMBB->setCallFrameSize(CallFrameSize); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) && !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) { FalseMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer any debug instructions inside the CMOV sequence to the sunk block. auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI), MachineBasicBlock::iterator(LastCMOV)); for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange)) if (MI.isDebugInstr()) SinkMBB->push_back(MI.removeFromParent()); // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->end(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FalseMBB); // The true block target of the first (or only) branch is always a SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FalseMBB. FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] // ... MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = std::next(MachineBasicBlock::iterator(LastCMOV)); createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB); // Now remove the CMOV(s). 
ThisMBB->erase(MIItBegin, MIItEnd); return SinkMBB; } static unsigned getSUBriOpcode(bool IsLP64) { if (IsLP64) return X86::SUB64ri32; else return X86::SUB32ri; } MachineBasicBlock * X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, MachineBasicBlock *MBB) const { MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); const MIMetadata MIMD(MI); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); const unsigned ProbeSize = getStackProbeSize(*MF); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = ++MBB->getIterator(); MF->insert(MBBIter, testMBB); MF->insert(MBBIter, blockMBB); MF->insert(MBBIter, tailMBB); Register sizeVReg = MI.getOperand(1).getReg(); Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP; Register TmpStackPtr = MRI.createVirtualRegister( TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); Register FinalStackPtr = MRI.createVirtualRegister( TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr) .addReg(physSPReg); { const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr; BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr) .addReg(TmpStackPtr) .addReg(sizeVReg); } // test rsp size BuildMI(testMBB, MIMD, TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) .addReg(FinalStackPtr) .addReg(physSPReg); BuildMI(testMBB, MIMD, TII->get(X86::JCC_1)) .addMBB(tailMBB) .addImm(X86::COND_GE); testMBB->addSuccessor(blockMBB); testMBB->addSuccessor(tailMBB); // Touch the block then extend it. 
This is done on the opposite side of // static probe where we allocate then touch, to avoid the need of probing the // tail of the static alloca. Possible scenarios are: // // + ---- <- ------------ <- ------------- <- ------------ + // | | // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] + // | | // + <- ----------- <- ------------ <- ----------- <- ------------ + // // The property we want to enforce is to never have more than [page alloc] between two probes. const unsigned XORMIOpc = TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi; addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0) .addImm(0); BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)), physSPReg) .addReg(physSPReg) .addImm(ProbeSize); BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB); blockMBB->addSuccessor(testMBB); // Replace original instruction by the expected stack ptr BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(FinalStackPtr); tailMBB->splice(tailMBB->end(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); tailMBB->transferSuccessorsAndUpdatePHIs(MBB); MBB->addSuccessor(testMBB); // Delete the original pseudo instruction. MI.eraseFromParent(); // And we're done. return tailMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const MIMetadata MIMD(MI); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); const bool Is64Bit = Subtarget.is64Bit(); const bool IsLP64 = Subtarget.isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... 
[Till the alloca] // If stacklet is not large enough, jump to mallocMBB // // bumpMBB: // Allocate by subtracting from RSP // Jump to continueMBB // // mallocMBB: // Allocate by call to runtime // // continueMBB: // ... // [rest of original BB] // MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI.getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); continueMBB->splice(continueMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. 
BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); BB->addSuccessor(mallocMBB); mallocMBB->addSuccessor(continueMBB); bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. 
BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI), MI.getOperand(0).getReg()) .addReg(mallocPtrVReg) .addMBB(mallocMBB) .addReg(bumpSPPtrVReg) .addMBB(bumpMBB); // Delete the original pseudo instruction. MI.eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); const MIMetadata MIMD(MI); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. if (!Subtarget.is32Bit()) return BB; // C++ EH creates a new target block to hold the restore code, and wires up // the new block to the return destination with a normal JMP_4. MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); assert(BB->succ_size() == 1); MF->insert(std::next(BB->getIterator()), RestoreMBB); RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RestoreMBB); MI.getOperand(0).setMBB(RestoreMBB); // Marking this as an EH pad but not a funclet entry block causes PEI to // restore stack pointers in the block. RestoreMBB->setIsEHPad(true); auto RestoreMBBI = RestoreMBB->begin(); BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: // adjust_stackdown -> TLSADDR -> adjust_stackup. // We need this because TLSADDR is lowered into calls // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. 
const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MIMetadata MIMD(MI); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. MF.getFrameInfo().setAdjustsStack(true); unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. // We don't call erase from parent because we want to keep the // original instruction around. unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); MachineInstrBuilder CallseqEnd = BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0); BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const { // This is pretty easy. We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); const MIMetadata MIMD(MI); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = Subtarget.is64Bit() ? 
Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget.is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (!isPositionIndependent()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI.eraseFromParent(); // The pseudo instruction is gone now. 
return BB; } static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) { switch (RPOpc) { case X86::INDIRECT_THUNK_CALL32: return X86::CALLpcrel32; case X86::INDIRECT_THUNK_CALL64: return X86::CALL64pcrel32; case X86::INDIRECT_THUNK_TCRETURN32: return X86::TCRETURNdi; case X86::INDIRECT_THUNK_TCRETURN64: return X86::TCRETURNdi64; } llvm_unreachable("not indirect thunk opcode"); } static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg) { if (Subtarget.useRetpolineExternalThunk()) { // When using an external thunk for retpolines, we pick names that match the // names GCC happens to use as well. This helps simplify the implementation // of the thunks for kernels where they have no easy ability to create // aliases and are doing non-trivial configuration of the thunk's body. For // example, the Linux kernel will do boot-time hot patching of the thunk // bodies and cannot easily export aliases of these to loaded modules. // // Note that at any point in the future, we may need to change the semantics // of how we implement retpolines and at that time will likely change the // name of the called thunk. Essentially, there is no hard guarantee that // LLVM will generate calls to specific thunks, we merely make a best-effort // attempt to help out kernels and other systems where duplicating the // thunks is costly. 
switch (Reg) { case X86::EAX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_eax"; case X86::ECX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_ecx"; case X86::EDX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_edx"; case X86::EDI: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_edi"; case X86::R11: assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__x86_indirect_thunk_r11"; } llvm_unreachable("unexpected reg for external indirect thunk"); } if (Subtarget.useRetpolineIndirectCalls() || Subtarget.useRetpolineIndirectBranches()) { // When targeting an internal COMDAT thunk use an LLVM-specific name. switch (Reg) { case X86::EAX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_eax"; case X86::ECX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_ecx"; case X86::EDX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_edx"; case X86::EDI: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_edi"; case X86::R11: assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } if (Subtarget.useLVIControlFlowIntegrity()) { assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__llvm_lvi_thunk_r11"; } llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature"); } MachineBasicBlock * X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. 
const MIMetadata MIMD(MI); const X86InstrInfo *TII = Subtarget.getInstrInfo(); Register CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none // are available, use EDI instead. EDI is chosen because EBX is the PIC base // register and ESI is the base pointer to realigned stack frames with VLAs. SmallVector AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. for (const auto &MO : MI.operands()) { if (MO.isReg() && MO.isUse()) for (unsigned &Reg : AvailableRegs) if (Reg == MO.getReg()) Reg = 0; } // Choose the first remaining non-zero available register. unsigned AvailableReg = 0; for (unsigned MaybeReg : AvailableRegs) { if (MaybeReg) { AvailableReg = MaybeReg; break; } } if (!AvailableReg) report_fatal_error("calling convention incompatible with retpoline, no " "available registers"); const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg) .addReg(CalleeVReg); MI.getOperand(0).ChangeToES(Symbol); MI.setDesc(TII->get(Opc)); MachineInstrBuilder(*BB->getParent(), &MI) .addReg(AvailableReg, RegState::Implicit | RegState::Kill); return BB; } /// SetJmp implies future control flow change upon calling the corresponding /// LongJmp. /// Instead of using the 'return' instruction, the long jump fixes the stack and /// performs an indirect branch. To do so it uses the registers that were stored /// in the jump buffer (when calling SetJmp). 
/// In case the shadow stack is enabled we need to fix it as well, because some /// return addresses will be skipped. /// The function will save the SSP for future fixing in the function /// emitLongJmpShadowStackFix. /// \sa emitLongJmpShadowStackFix /// \param [in] MI The temporary Machine Instruction for the builtin. /// \param [in] MBB The Machine Basic Block that will be modified. void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineInstrBuilder MIB; // Memory Reference. SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); // Initialize a register with zero. MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc)) .addDef(ZReg) .addReg(ZReg, RegState::Undef) .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Write the SSP register value to offset 3 in input memory buffer. unsigned PtrStoreOpc = (PVT == MVT::i64) ? 
X86::MOV64mr : X86::MOV32mr; MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc)); const int64_t SSPOffset = 3 * PVT.getStoreSize(); const unsigned MemOpndSlot = 1; for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } MIB.addReg(SSPCopyReg); MIB.setMemRefs(MMOs); } MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; Register mainDstReg = MRI.createVirtualRegister(RC); Register restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: // v_main = 0 // // sinkMBB: // v = phi(main, restore) // // restoreMBB: // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); 
restoreMBB->setMachineBlockAddressTaken(); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget.is64Bit()) { MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast(TII); MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ? 
X86::MOV64mi32 : X86::MOV32mi; // Store IP MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); else MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOs); if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) { emitSetJmpShadowStackFix(MI, thisMBB); } // Setup MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); // mainMBB: // EAX = 0 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg) .addReg(mainDstReg) .addMBB(mainMBB) .addReg(restoreDstReg) .addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo(); X86FI->setRestoreBasePointer(MF); Register FramePtr = RegInfo->getFrameRegister(*MF); Register BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI.eraseFromParent(); return sinkMBB; } /// Fix the shadow stack using the previously saved SSP pointer. /// \sa emitSetJmpShadowStackFix /// \param [in] MI The temporary Machine Instruction for the builtin. 
/// \param [in] MBB The Machine Basic Block that will be modified. /// \return The sink MBB that will perform the future indirect branch. MachineBasicBlock * X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); // checkSspMBB: // xor vreg1, vreg1 // rdssp vreg1 // test vreg1, vreg1 // je sinkMBB # Jump if Shadow Stack is not supported // fallMBB: // mov buf+24/12(%rip), vreg2 // sub vreg1, vreg2 // jbe sinkMBB # No need to fix the Shadow Stack // fixShadowMBB: // shr 3/2, vreg2 // incssp vreg2 # fix the SSP according to the lower 8 bits // shr 8, vreg2 // je sinkMBB // fixShadowLoopPrepareMBB: // shl vreg2 // mov 128, vreg3 // fixShadowLoopMBB: // incssp vreg3 // dec vreg2 // jne fixShadowLoopMBB # Iterate until you finish fixing // # the Shadow Stack // sinkMBB: MachineFunction::iterator I = ++MBB->getIterator(); const BasicBlock *BB = MBB->getBasicBlock(); MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, checkSspMBB); MF->insert(I, fallMBB); MF->insert(I, fixShadowMBB); MF->insert(I, fixShadowLoopPrepareMBB); MF->insert(I, fixShadowLoopMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. 
sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass); BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg); if (PVT == MVT::i64) { Register TmpZReg = MRI.createVirtualRegister(PtrRC); BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg) .addImm(0) .addReg(ZReg) .addImm(X86::sub_32bit); ZReg = TmpZReg; } // Read the current SSP Register value to the zeroed register. Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Check whether the result of the SSP register is zero and jump directly // to the sink. unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr; BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc)) .addReg(SSPCopyReg) .addReg(SSPCopyReg); BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1)) .addMBB(sinkMBB) .addImm(X86::COND_E); checkSspMBB->addSuccessor(sinkMBB); checkSspMBB->addSuccessor(fallMBB); // Reload the previously saved SSP register value. Register PrevSSPReg = MRI.createVirtualRegister(PtrRC); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; const int64_t SPPOffset = 3 * PVT.getStoreSize(); MachineInstrBuilder MIB = BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) MIB.addDisp(MO, SPPOffset); else if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Subtract the current SSP from the previous SSP. Register SspSubReg = MRI.createVirtualRegister(PtrRC); unsigned SubRROpc = (PVT == MVT::i64) ? 
X86::SUB64rr : X86::SUB32rr; BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg) .addReg(PrevSSPReg) .addReg(SSPCopyReg); // Jump to sink in case PrevSSPReg <= SSPCopyReg. BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1)) .addMBB(sinkMBB) .addImm(X86::COND_BE); fallMBB->addSuccessor(sinkMBB); fallMBB->addSuccessor(fixShadowMBB); // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8. unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; unsigned Offset = (PVT == MVT::i64) ? 3 : 2; Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg) .addReg(SspSubReg) .addImm(Offset); // Increase SSP when looking only on the lower 8 bits of the delta. unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD; BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg); // Reset the lower 8 bits. Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg) .addReg(SspFirstShrReg) .addImm(8); // Jump if the result of the shift is zero. BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1)) .addMBB(sinkMBB) .addImm(X86::COND_E); fixShadowMBB->addSuccessor(sinkMBB); fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); // Do a single shift left. unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri; Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg) .addReg(SspSecondShrReg) .addImm(1); // Save the value 128 to a register (will be used next with incssp). Register Value128InReg = MRI.createVirtualRegister(PtrRC); unsigned MovRIOpc = (PVT == MVT::i64) ? 
X86::MOV64ri32 : X86::MOV32ri; BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg) .addImm(128); fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB); // Since incssp only looks at the lower 8 bits, we might need to do several // iterations of incssp until we finish fixing the shadow stack. Register DecReg = MRI.createVirtualRegister(PtrRC); Register CounterReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg) .addReg(SspAfterShlReg) .addMBB(fixShadowLoopPrepareMBB) .addReg(DecReg) .addMBB(fixShadowLoopMBB); // Every iteration we increase the SSP by 128. BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg); // Every iteration we decrement the counter by 1. unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r; BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg); // Jump if the counter is not zero yet. BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1)) .addMBB(fixShadowLoopMBB) .addImm(X86::COND_NE); fixShadowLoopMBB->addSuccessor(sinkMBB); fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); return sinkMBB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); Register FP = (PVT == MVT::i64) ? 
X86::RBP : X86::EBP; Register SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; MachineBasicBlock *thisMBB = MBB; // When CET and shadow stack is enabled, we need to fix the Shadow Stack. if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) { thisMBB = emitLongJmpShadowStackFix(MI, thisMBB); } // Reload FP MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Reload IP MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) MIB.addDisp(MO, LabelOffset); else if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Reload SP MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's // the last instruction of the expansion. 
} MIB.setMemRefs(MMOs); // Jump BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp); MI.eraseFromParent(); return thisMBB; } void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { const MIMetadata MIMD(MI); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); unsigned Op = 0; unsigned VR = 0; bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); if (UseImmLabel) { Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; } else { const TargetRegisterClass *TRC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; if (Subtarget.is64Bit()) BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) .addImm(1) .addReg(0) .addMBB(DispatchBB) .addReg(0); else BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR) .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op)); addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36); if (UseImmLabel) MIB.addMBB(DispatchBB); else MIB.addReg(VR); } MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { const MIMetadata MIMD(MI); MachineFunction *MF = BB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MF->getFrameInfo().getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. 
DenseMap> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (auto &MBB : *MF) { if (!MBB.isEHPad()) continue; MCSymbol *Sym = nullptr; for (const auto &MI : MBB) { if (MI.isDebugInstr()) continue; assert(MI.isEHLabel() && "expected EH_LABEL"); Sym = MI.getOperand(0).getMCSymbol(); break; } if (!MF->hasCallSiteLandingPad(Sym)) continue; for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { CallSiteNumToLPad[CSI].push_back(&MBB); MaxCSNum = std::max(MaxCSNum, CSI); } } // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { for (auto &LP : CallSiteNumToLPad[CSI]) { LPadList.push_back(LP); InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // Create the MBBs for the dispatch code. // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(true); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); BuildMI(TrapBB, MIMD, TII->get(X86::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert MBBs. MF->push_back(DispatchBB); MF->push_back(DispContBB); MF->push_back(TrapBB); // Insert code into the entry block that creates and registers the function // context. SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); // Create the jump table and associated information unsigned JTE = getJumpTableEncoding(); MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); unsigned MJTI = JTI->createJumpTableIndex(LPadList); const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. 
if (RI.hasBasePointer(*MF)) { const bool FPIs64Bit = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *MFI = MF->getInfo(); MFI->setRestoreBasePointer(MF); Register FP = RI.getFrameRegister(*MF); Register BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) .addRegMask(RI.getNoPreservedMask()); } else { BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP)) .addRegMask(RI.getNoPreservedMask()); } // IReg is used as an index in a memory operand and therefore can't be SP Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 8 : 4); BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1)) .addMBB(TrapBB) .addImm(X86::COND_AE); if (Subtarget.is64Bit()) { Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass); Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg) .addReg(X86::RIP) .addImm(1) .addReg(0) .addJumpTableIndex(MJTI) .addReg(0); // movzx IReg64, IReg BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) .addImm(0) .addReg(IReg) .addImm(X86::sub_32bit); switch (JTE) { case MachineJumpTableInfo::EK_BlockAddress: // jmpq *(BReg,IReg64,8) BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m)) .addReg(BReg) .addImm(8) .addReg(IReg64) .addImm(0) .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass); Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg) 
.addReg(BReg) .addImm(4) .addReg(IReg64) .addImm(0) .addReg(0); // movsx OReg64, OReg BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64) .addReg(OReg); // addq BReg, OReg64, TReg BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg) .addReg(OReg64) .addReg(BReg); // jmpq *TReg BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg); break; } default: llvm_unreachable("Unexpected jump table encoding"); } } else { // jmpl *.LJTI0_0(,IReg,4) BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m)) .addReg(0) .addImm(4) .addReg(IReg) .addJumpTableIndex(MJTI) .addReg(0); } // Add the jump table entries as successors to the MBB. SmallPtrSet SeenMBBs; for (auto &LP : LPadList) if (SeenMBBs.insert(LP).second) DispContBB->addSuccessor(LP); // N.B. the order the invoke BBs are processed in doesn't matter here. SmallVector MBBLPads; const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. // Keep a copy of Successors since it's modified inside the loop. SmallVector Successors(MBB->succ_rbegin(), MBB->succ_rend()); // FIXME: Avoid quadratic complexity. for (auto *MBBS : Successors) { if (MBBS->isEHPad()) { MBB->removeSuccessor(MBBS); MBBLPads.push_back(MBBS); } } MBB->addSuccessor(DispatchBB); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. 
for (auto &II : reverse(*MBB)) { if (!II.isCall()) continue; DenseMap DefRegs; for (auto &MOp : II.operands()) if (MOp.isReg()) DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) { unsigned Reg = SavedRegs[RegIdx]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (auto &LP : MBBLPads) LP->setIsEHPad(false); // The instruction is gone now. MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::emitPatchableEventCall(MachineInstr &MI, MachineBasicBlock *BB) const { // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing // calls may require proper stack alignment. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MIMetadata MIMD(MI); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. MF.getFrameInfo().setAdjustsStack(true); unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. 
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); MachineInstrBuilder CallseqEnd = BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0); BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); return BB; } MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const MIMetadata MIMD(MI); auto TMMImmToTMMReg = [](unsigned Imm) { assert (Imm < 8 && "Illegal tmm index"); return X86::TMM0 + Imm; }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: case X86::TLS_addr64: case X86::TLS_addrX32: case X86::TLS_base_addr32: case X86::TLS_base_addr64: case X86::TLS_base_addrX32: case X86::TLS_desc32: case X86::TLS_desc64: return EmitLoweredTLSAddr(MI, BB); case X86::INDIRECT_THUNK_CALL32: case X86::INDIRECT_THUNK_CALL64: case X86::INDIRECT_THUNK_TCRETURN32: case X86::INDIRECT_THUNK_TCRETURN64: return EmitLoweredIndirectThunk(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::PROBED_ALLOCA_32: case X86::PROBED_ALLOCA_64: return EmitLoweredProbedAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR16: case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_VR64: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: case X86::CMOV_VK1: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: case X86::CMOV_VK16: case X86::CMOV_VK32: case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); case X86::FP80_ADDr: case X86::FP80_ADDm32: { // 
Change the floating point control register to use double extended // precision when performing the addition. int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); // Load the old value of the control word... Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended // precision. Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill) .addImm(0x300); // Extract to 16 bits. Register NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)), NewCWFrameIdx) .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Do the addition. if (MI.getOpcode() == X86::FP80_ADDr) { BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80)) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .add(MI.getOperand(2)); } else { BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32)) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .add(MI.getOperand(2)) .add(MI.getOperand(3)) .add(MI.getOperand(4)) .add(MI.getOperand(5)) .add(MI.getOperand(6)); } // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. 
return BB; } case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: case X86::FP64_TO_INT16_IN_MEM: case X86::FP64_TO_INT32_IN_MEM: case X86::FP64_TO_INT64_IN_MEM: case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); // Load the old value of the control word... Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill).addImm(0xC00); // Extract to 16 bits. Register NewCW16 = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)), NewCWFrameIdx) .addReg(NewCW16, RegState::Kill); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), NewCWFrameIdx); // Get the X86 opcode to use. 
unsigned Opc; switch (MI.getOpcode()) { // clang-format off default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; // clang-format on } X86AddressMode AM = getAddressFromInstr(&MI, 0); addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM) .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)), OrigCWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); case X86::VAARG_64: case X86::VAARG_X32: return EmitVAARGWithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: return emitEHSjLjSetJmp(MI, BB); case X86::EH_SjLj_LongJmp32: case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); case X86::Int_eh_sjlj_setup_dispatch: return EmitSjLjDispatchBlock(MI, BB); case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. 
return emitPatchPoint(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case TargetOpcode::PATCHABLE_EVENT_CALL: case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: return emitPatchableEventCall(MI, BB); case X86::LCMPXCHG8B: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B // requires a memory operand. If it happens that current architecture is // i686 and for current function we need a base pointer // - which is ESI for i686 - register allocator would not be able to // allocate registers for an address in form of X(%reg, %reg, Y) // - there never would be enough unreserved registers during regalloc // (without the need for base ptr the only option would be X(%edi, %esi, Y). // We are giving a hand to register allocator by precomputing the address in // a new vreg using LEA. // If it is not i686 or there is no base pointer - nothing to do here. if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF)) return BB; // Even though this code does not necessarily needs the base pointer to // be ESI, we check for that. The reason: if this assert fails, there are // some changes happened in the compiler base pointer handling, which most // probably have to be addressed somehow here. assert(TRI->getBaseRegister() == X86::ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a " "base pointer in mind"); MachineRegisterInfo &MRI = MF->getRegInfo(); MVT SPTy = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); X86AddressMode AM = getAddressFromInstr(&MI, 0); // Regalloc does not need any help when the memory operand of CMPXCHG8B // does not use index register. 
if (AM.IndexReg == X86::NoRegister) return BB; // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its // four operand definitions that are E[ABCD] registers. We skip them and // then insert the LEA. MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator()); while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) || RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) || RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) || RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) { ++RMBBI; } MachineBasicBlock::iterator MBBI(RMBBI); addFullAddress( BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM); setDirectAddressInInstr(&MI, 0, computedAddrVReg); return BB; } case X86::LCMPXCHG16B_NO_RBX: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); Register BasePtr = TRI->getBaseRegister(); if (TRI->hasBasePointer(*MF) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { if (!BB->isLiveIn(BasePtr)) BB->addLiveIn(BasePtr); // Save RBX into a virtual register. Register SaveRBX = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX) .addReg(X86::RBX); Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst); for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) MIB.add(MI.getOperand(Idx)); MIB.add(MI.getOperand(X86::AddrNumOperands)); MIB.addReg(SaveRBX); } else { // Simple case, just copy the virtual register to RBX. 
BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX) .add(MI.getOperand(X86::AddrNumOperands)); MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B)); for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) MIB.add(MI.getOperand(Idx)); } MI.eraseFromParent(); return BB; } case X86::MWAITX: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); Register BasePtr = TRI->getBaseRegister(); bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX); // If no need to save the base pointer, we generate MWAITXrrr, // else we generate pseudo MWAITX_SAVE_RBX. if (!IsRBX || !TRI->hasBasePointer(*MF)) { BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI.getOperand(0).getReg()); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI.getOperand(1).getReg()); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX) .addReg(MI.getOperand(2).getReg()); BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr)); MI.eraseFromParent(); } else { if (!BB->isLiveIn(BasePtr)) { BB->addLiveIn(BasePtr); } // Parameters can be copied into ECX and EAX but not EBX yet. BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI.getOperand(0).getReg()); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI.getOperand(1).getReg()); assert(Subtarget.is64Bit() && "Expected 64-bit mode!"); // Save RBX into a virtual register. Register SaveRBX = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX) .addReg(X86::RBX); // Generate mwaitx pseudo. Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX)) .addDef(Dst) // Destination tied in with SaveRBX. .addReg(MI.getOperand(2).getReg()) // input value of EBX. .addUse(SaveRBX); // Save of base pointer. 
MI.eraseFromParent(); } return BB; } case TargetOpcode::PREALLOCATED_SETUP: { assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); auto *MFI = MF->getInfo(); MFI->setHasPreallocatedCall(true); int64_t PreallocatedId = MI.getOperand(0).getImm(); size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId); assert(StackAdjustment != 0 && "0 stack adjustment"); LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment " << StackAdjustment << "\n"); BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP) .addReg(X86::ESP) .addImm(StackAdjustment); MI.eraseFromParent(); return BB; } case TargetOpcode::PREALLOCATED_ARG: { assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); int64_t PreallocatedId = MI.getOperand(1).getImm(); int64_t ArgIdx = MI.getOperand(2).getImm(); auto *MFI = MF->getInfo(); size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx << ", arg offset " << ArgOffset << "\n"); // stack pointer + offset addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r), MI.getOperand(0).getReg()), X86::ESP, false, ArgOffset); MI.eraseFromParent(); return BB; } case X86::PTDPBSSD: case X86::PTDPBSUD: case X86::PTDPBUSD: case X86::PTDPBUUD: case X86::PTDPBF16PS: case X86::PTDPFP16PS: { unsigned Opc; switch (MI.getOpcode()) { // clang-format off default: llvm_unreachable("illegal opcode!"); case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; // clang-format on } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); 
MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::PTILEZERO: { unsigned Imm = MI.getOperand(0).getImm(); BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); MI.eraseFromParent(); // The pseudo is gone now. auto *MFI = MF->getInfo(); MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); return BB; } case X86::PTILEZEROV: { auto *MFI = MF->getInfo(); MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); return BB; } case X86::PTILELOADD: case X86::PTILELOADDT1: case X86::PTILESTORED: { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); #define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC) case X86::PTILELOADD: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); break; case X86::PTILELOADDT1: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1); break; case X86::PTILESTORED: Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; #undef GET_EGPR_IF_ENABLED } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX) MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), RegState::Define); MIB.add(MI.getOperand(CurOp++)); // base MIB.add(MI.getOperand(CurOp++)); // scale MIB.add(MI.getOperand(CurOp++)); // index -- stride MIB.add(MI.getOperand(CurOp++)); // displacement MIB.add(MI.getOperand(CurOp++)); // segment if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX) MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), RegState::Undef); MI.eraseFromParent(); // The pseudo is gone now. 
return BB; } case X86::PTCMMIMFP16PS: case X86::PTCMMRLFP16PS: { const MIMetadata MIMD(MI); unsigned Opc; switch (MI.getOpcode()) { // clang-format off default: llvm_unreachable("Unexpected instruction!"); case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; // clang-format on } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); MI.eraseFromParent(); // The pseudo is gone now. return BB; } } } //===----------------------------------------------------------------------===// // X86 Optimization Hooks //===----------------------------------------------------------------------===// bool X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const { EVT VT = Op.getValueType(); unsigned Opcode = Op.getOpcode(); unsigned EltSize = VT.getScalarSizeInBits(); if (VT.isVector()) { // If the constant is only all signbits in the active bits, then we should // extend it to the entire constant to allow it act as a boolean constant // vector. auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) { if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode())) return false; for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) { if (!DemandedElts[i] || V.getOperand(i).isUndef()) continue; const APInt &Val = V.getConstantOperandAPInt(i); if (Val.getBitWidth() > Val.getNumSignBits() && Val.trunc(ActiveBits).getNumSignBits() == ActiveBits) return true; } return false; }; // For vectors - if we have a constant, then try to sign extend. // TODO: Handle AND cases. 
unsigned ActiveBits = DemandedBits.getActiveBits(); if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) && (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) && NeedsSignExtension(Op.getOperand(1), ActiveBits)) { EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits); EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT, VT.getVectorNumElements()); SDValue NewC = TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT, Op.getOperand(1), TLO.DAG.getValueType(ExtVT)); SDValue NewOp = TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC); return TLO.CombineTo(Op, NewOp); } return false; } // Only optimize Ands to prevent shrinking a constant that could be // matched by movzx. if (Opcode != ISD::AND) return false; // Make sure the RHS really is a constant. ConstantSDNode *C = dyn_cast(Op.getOperand(1)); if (!C) return false; const APInt &Mask = C->getAPIntValue(); // Clear all non-demanded bits initially. APInt ShrunkMask = Mask & DemandedBits; // Find the width of the shrunk mask. unsigned Width = ShrunkMask.getActiveBits(); // If the mask is all 0s there's nothing to do here. if (Width == 0) return false; // Find the next power of 2 width, rounding up to a byte. Width = llvm::bit_ceil(std::max(Width, 8U)); // Truncate the width to size to handle illegal types. Width = std::min(Width, EltSize); // Calculate a possible zero extend mask for this constant. APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width); // If we aren't changing the mask, just return true to keep it and prevent // the caller from optimizing. if (ZeroExtendMask == Mask) return true; // Make sure the new mask can be represented by a combination of mask bits // and non-demanded bits. if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits)) return false; // Replace the constant with the zero extend mask. 
SDLoc DL(Op); SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT); SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); return TLO.CombineTo(Op, NewOp); } static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) { KnownBits Known2; unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1); Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1); Known = KnownBits::abdu(Known, Known2).zext(16); // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7))) Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true, Known, Known); Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true, Known, Known); Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true, Known, Known); Known = Known.zext(64); } static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) { unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs. 
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); APInt DemandedLoElts = DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01)); APInt DemandedHiElts = DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10)); KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1); KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1); KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1); KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1); KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32)); KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32)); Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false, /*NUW=*/false, Lo, Hi); } static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) { unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi // pairs. 
APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); APInt DemandedLoElts = DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01)); APInt DemandedHiElts = DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10)); KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1); KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1); KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1); KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1); KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16)); KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16)); Known = KnownBits::sadd_sat(Lo, Hi); } static KnownBits computeKnownBitsForHorizontalOperation( const SDValue Op, const APInt &DemandedElts, unsigned Depth, const SelectionDAG &DAG, const function_ref KnownBitsFunc) { APInt DemandedEltsLHS, DemandedEltsRHS; getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(), DemandedElts, DemandedEltsLHS, DemandedEltsRHS); const auto ComputeForSingleOpFunc = [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) { return KnownBitsFunc( DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1), DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1)); }; if (DemandedEltsRHS.isZero()) return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS); if (DemandedEltsLHS.isZero()) return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS); return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS) .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS)); } void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); unsigned NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc 
== ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); Known.resetAll(); switch (Opc) { default: break; case X86ISD::MUL_IMM: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known = KnownBits::mul(Known, Known2); break; } case X86ISD::SETCC: Known.Zero.setBitsFrom(1); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); Known.Zero.setBitsFrom(NumLoBits); break; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); Known = Known.anyextOrTrunc(BitWidth); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } case X86ISD::VSRAI: case X86ISD::VSHLI: case X86ISD::VSRLI: { unsigned ShAmt = Op.getConstantOperandVal(1); if (ShAmt >= VT.getScalarSizeInBits()) { // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. if (Opc != X86ISD::VSRAI) { Known.setAllZero(); break; } ShAmt = VT.getScalarSizeInBits() - 1; } Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); if (Opc == X86ISD::VSHLI) { Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else if (Opc == X86ISD::VSRLI) { Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits are known zero. Known.Zero.setHighBits(ShAmt); } else { Known.Zero.ashrInPlace(ShAmt); Known.One.ashrInPlace(ShAmt); } break; } case X86ISD::PACKUS: { // PACKUS is just a truncation if the upper half is zero. 
APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); Known.One = APInt::getAllOnes(BitWidth * 2); Known.Zero = APInt::getAllOnes(BitWidth * 2); KnownBits Known2; if (!!DemandedLHS) { Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1); Known = Known.intersectWith(Known2); } if (!!DemandedRHS) { Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1); Known = Known.intersectWith(Known2); } if (Known.countMinLeadingZeros() < BitWidth) Known.resetAll(); Known = Known.trunc(BitWidth); break; } case X86ISD::PSHUFB: { SDValue Src = Op.getOperand(0); SDValue Idx = Op.getOperand(1); // If the index vector is never negative (MSB is zero), then all elements // come from the source vector. This is useful for cases where // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling // below will handle the more common constant shuffle mask case. KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1); if (KnownIdx.isNonNegative()) Known = DAG.computeKnownBits(Src, Depth + 1); break; } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); if (!Src.getSimpleValueType().isVector()) { Known = DAG.computeKnownBits(Src, Depth + 1); return; } break; } case X86ISD::AND: { if (Op.getResNo() == 0) { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known &= Known2; } break; } case X86ISD::ANDNP: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); // ANDNP = (~X & Y); Known.One &= Known2.Zero; Known.Zero |= Known2.One; break; } case X86ISD::FOR: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known |= Known2; break; } case 
X86ISD::PSADBW: { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); assert(VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && "Unexpected PSADBW types"); computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth); break; } case X86ISD::PCMPGT: case X86ISD::PCMPEQ: { KnownBits KnownLhs = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); KnownBits KnownRhs = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); std::optional Res = Opc == X86ISD::PCMPEQ ? KnownBits::eq(KnownLhs, KnownRhs) : KnownBits::sgt(KnownLhs, KnownRhs); if (Res) { if (*Res) Known.setAllOnes(); else Known.setAllZero(); } break; } case X86ISD::VPMADDWD: { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); assert(VT.getVectorElementType() == MVT::i32 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getVectorElementType() == MVT::i16 && "Unexpected PMADDWD types"); computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth); break; } case X86ISD::VPMADDUBSW: { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); assert(VT.getVectorElementType() == MVT::i16 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getVectorElementType() == MVT::i8 && "Unexpected PMADDUBSW types"); computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth); break; } case X86ISD::PMULUDQ: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known = Known.trunc(BitWidth / 2).zext(BitWidth); Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth); Known = KnownBits::mul(Known, Known2); break; } case X86ISD::CMOV: { Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); // If we don't know any bits, early out. 
if (Known.isUnknown()) break; KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); // Only known if known in both the LHS and RHS. Known = Known.intersectWith(Known2); break; } case X86ISD::BEXTR: case X86ISD::BEXTRI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (auto* Cst1 = dyn_cast(Op1)) { unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); // If the length is 0, the result is 0. if (Length == 0) { Known.setAllZero(); break; } if ((Shift + Length) <= BitWidth) { Known = DAG.computeKnownBits(Op0, Depth + 1); Known = Known.extractBits(Length, Shift); Known = Known.zextOrTrunc(BitWidth); } } break; } case X86ISD::PDEP: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); // Zeros are retained from the mask operand. But not ones. Known.One.clearAllBits(); // The result will have at least as many trailing zeros as the non-mask // operand since bits can only map to the same or higher bit position. Known.Zero.setLowBits(Known2.countMinTrailingZeros()); break; } case X86ISD::PEXT: { Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); // The result has as many leading zeros as the number of zeroes in the mask. 
unsigned Count = Known.Zero.popcount(); Known.Zero = APInt::getHighBitsSet(BitWidth, Count); Known.One.clearAllBits(); break; } case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: case X86ISD::VFPROUND: case X86ISD::VMFPROUND: case X86ISD::CVTPS2PH: case X86ISD::MCVTPS2PH: { // Truncations/Conversions - upper elements are known zero. EVT SrcVT = Op.getOperand(0).getValueType(); if (SrcVT.isVector()) { unsigned NumSrcElts = SrcVT.getVectorNumElements(); if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts) Known.setAllZero(); } break; } case X86ISD::STRICT_CVTTP2SI: case X86ISD::STRICT_CVTTP2UI: case X86ISD::STRICT_CVTSI2P: case X86ISD::STRICT_CVTUI2P: case X86ISD::STRICT_VFPROUND: case X86ISD::STRICT_CVTPS2PH: { // Strict Conversions - upper elements are known zero. EVT SrcVT = Op.getOperand(1).getValueType(); if (SrcVT.isVector()) { unsigned NumSrcElts = SrcVT.getVectorNumElements(); if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts) Known.setAllZero(); } break; } case X86ISD::MOVQ2DQ: { // Move from MMX to XMM. Upper half of XMM should be 0. 
if (DemandedElts.countr_zero() >= (NumElts / 2)) Known.setAllZero(); break; } case X86ISD::VBROADCAST_LOAD: { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits, /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) { Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned I = 0; I != NumElts; ++I) { if (!DemandedElts[I]) continue; if (UndefElts[I]) { Known.resetAll(); break; } KnownBits Known2 = KnownBits::makeConstant(EltBits[I]); Known = Known.intersectWith(Known2); } return; } break; } case X86ISD::HADD: case X86ISD::HSUB: { Known = computeKnownBitsForHorizontalOperation( Op, DemandedElts, Depth, DAG, [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) { return KnownBits::computeForAddSub( /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false, KnownLHS, KnownRHS); }); break; } case ISD::INTRINSIC_WO_CHAIN: { switch (Op->getConstantOperandVal(0)) { case Intrinsic::x86_sse2_pmadd_wd: case Intrinsic::x86_avx2_pmadd_wd: case Intrinsic::x86_avx512_pmaddw_d_512: { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); assert(VT.getScalarType() == MVT::i32 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i16 && "Unexpected PMADDWD types"); computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth); break; } case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); assert(VT.getScalarType() == MVT::i16 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && "Unexpected PMADDUBSW types"); computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth); break; } case Intrinsic::x86_sse2_psad_bw: case Intrinsic::x86_avx2_psad_bw: case Intrinsic::x86_avx512_psad_bw_512: { SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); assert(VT.getScalarType() 
== MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && "Unexpected PSADBW types"); computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth); break; } } break; } } // Handle target shuffles. // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. if (isTargetShuffle(Opc)) { SmallVector Mask; SmallVector Ops; if (getTargetShuffleMask(Op, true, Ops, Mask)) { unsigned NumOps = Ops.size(); unsigned NumElts = VT.getVectorNumElements(); if (Mask.size() == NumElts) { SmallVector DemandedOps(NumOps, APInt(NumElts, 0)); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) continue; int M = Mask[i]; if (M == SM_SentinelUndef) { // For UNDEF elements, we don't know anything about the common state // of the shuffle result. Known.resetAll(); break; } if (M == SM_SentinelZero) { Known.One.clearAllBits(); continue; } assert(0 <= M && (unsigned)M < (NumOps * NumElts) && "Shuffle index out of range"); unsigned OpIdx = (unsigned)M / NumElts; unsigned EltIdx = (unsigned)M % NumElts; if (Ops[OpIdx].getValueType() != VT) { // TODO - handle target shuffle ops with different value types. Known.resetAll(); break; } DemandedOps[OpIdx].setBit(EltIdx); } // Known bits are the values that are shared by every demanded element. for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) { if (!DemandedOps[i]) continue; KnownBits Known2 = DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1); Known = Known.intersectWith(Known2); } } } } }

/// Computes the number of sign bits known for the demanded elements of the
/// X86-specific node \p Op.
///
/// \param Op           Node to analyse; its opcode selects the rule applied.
/// \param DemandedElts Mask of the result elements the caller is interested
///                     in. It is forwarded (possibly remapped) to the operand
///                     queries.
/// \param DAG          DAG used for the recursive ComputeNumSignBits queries.
/// \param Depth        Current recursion depth; every operand query is issued
///                     at Depth + 1.
/// \returns The scalar bit width when every bit is known to equal the sign
///          bit, a smaller count derived from the operands, or 1 when no rule
///          applies.
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getScalarSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::SETCC_CARRY:
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    return VTBits;

  case X86ISD::VTRUNC: {
    SDValue Src = Op.getOperand(0);
    MVT SrcVT = Src.getSimpleValueType();
    unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
    assert(VTBits < NumSrcBits && "Illegal truncation input type");
    // The source may have a different element count than the result, so
    // resize the demanded mask to match it.
    APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
    // Only the sign bits beyond the truncated-away upper bits survive.
    if (Tmp > (NumSrcBits - VTBits))
      return Tmp - (NumSrcBits - VTBits);
    return 1;
  }

  case X86ISD::PACKSS: {
    // PACKSS is just a truncation if the sign bits extend to the packed size.
    APInt DemandedLHS, DemandedRHS;
    getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
                        DemandedRHS);

    // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
    // patterns often used to compact vXi64 allsignbit patterns.
    auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
      SDValue BC = peekThroughBitcasts(V);
      if (BC.getOpcode() == X86ISD::PACKSS &&
          BC.getScalarValueSizeInBits() == 16 &&
          V.getScalarValueSizeInBits() == 32) {
        SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
        SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
        if (BC0.getScalarValueSizeInBits() == 64 &&
            BC1.getScalarValueSizeInBits() == 64 &&
            DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
            DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
          return 32;
      }
      return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
    };

    unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
    // An operand with no demanded elements does not constrain the result, so
    // it starts at the maximum.
    unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
    if (!!DemandedLHS)
      Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
    if (!!DemandedRHS)
      Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
    unsigned Tmp = std::min(Tmp0, Tmp1);
    if (Tmp > (SrcBits - VTBits))
      return Tmp - (SrcBits - VTBits);
    return 1;
  }

  case X86ISD::VBROADCAST: {
    SDValue Src = Op.getOperand(0);
    // Only the scalar-source form is handled here; a vector source falls
    // through to the generic handling below.
    if (!Src.getSimpleValueType().isVector())
      return DAG.ComputeNumSignBits(Src, Depth + 1);
    break;
  }

  case X86ISD::VSHLI: {
    SDValue Src = Op.getOperand(0);
    const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
    if (ShiftVal.uge(VTBits))
      return VTBits; // Shifted all bits out --> zero.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    if (ShiftVal.uge(Tmp))
      return 1; // Shifted all sign bits out --> unknown.
    return Tmp - ShiftVal.getZExtValue();
  }

  case X86ISD::VSRAI: {
    SDValue Src = Op.getOperand(0);
    APInt ShiftVal = Op.getConstantOperandAPInt(1);
    if (ShiftVal.uge(VTBits - 1))
      return VTBits; // Sign splat.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    // Each shifted-in position adds one more copy of the sign bit, clamped to
    // the element width.
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }

  case X86ISD::FSETCC:
    // cmpss/cmpsd return zero/all-bits result values in the bottom element.
    if (VT == MVT::f32 || VT == MVT::f64 ||
        ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
      return VTBits;
    break;

  case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values.
    return VTBits;

  case X86ISD::ANDNP: {
    unsigned Tmp0 =
        DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.
    unsigned Tmp1 =
        DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
    return std::min(Tmp0, Tmp1);
  }

  case X86ISD::CMOV: {
    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.
    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    return std::min(Tmp0, Tmp1);
  }
  }

  // Handle target shuffles.
  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
  if (isTargetShuffle(Opcode)) {
    // NOTE(review): the template arguments of these SmallVectors were missing
    // in this file; element types are taken from how the values are used and
    // the default inline capacity is used - confirm against upstream.
    SmallVector<int> Mask;
    SmallVector<SDValue> Ops;
    if (getTargetShuffleMask(Op, true, Ops, Mask)) {
      unsigned NumOps = Ops.size();
      unsigned NumElts = VT.getVectorNumElements();
      if (Mask.size() == NumElts) {
        // Per-operand masks of the source elements feeding a demanded result
        // element.
        SmallVector<APInt> DemandedOps(NumOps, APInt(NumElts, 0));
        for (unsigned i = 0; i != NumElts; ++i) {
          if (!DemandedElts[i])
            continue;
          int M = Mask[i];
          if (M == SM_SentinelUndef) {
            // For UNDEF elements, we don't know anything about the common state
            // of the shuffle result.
            return 1;
          } else if (M == SM_SentinelZero) {
            // Zero = all sign bits.
            continue;
          }
          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
                 "Shuffle index out of range");

          unsigned OpIdx = (unsigned)M / NumElts;
          unsigned EltIdx = (unsigned)M % NumElts;
          if (Ops[OpIdx].getValueType() != VT) {
            // TODO - handle target shuffle ops with different value types.
            return 1;
          }
          DemandedOps[OpIdx].setBit(EltIdx);
        }
        // The result is the minimum over every operand that contributes a
        // demanded element; stop early once nothing more can be learned.
        unsigned Tmp0 = VTBits;
        for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
          if (!DemandedOps[i])
            continue;
          unsigned Tmp1 =
              DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
          Tmp0 = std::min(Tmp0, Tmp1);
        }
        return Tmp0;
      }
    }
  }

  // Fallback case.
  return 1;
}

/// Strips an X86ISD::Wrapper or X86ISD::WrapperRIP node.
///
/// \returns Operand 0 of \p N when \p N is one of the two wrapper opcodes,
///          otherwise \p N itself.
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
    return N->getOperand(0);
  return N;
}

// Helper to look for a normal load that can be narrowed into a vzload with the
// specified VT and memory VT. Returns SDValue() on failure.
//
// The new X86ISD::VZEXT_LOAD node reuses the chain, base pointer, pointer
// info, original alignment and memory-operand flags of LN; its results are
// (VT, MVT::Other).
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
                                  SelectionDAG &DAG) {
  // Can't if the load is volatile or atomic.
  if (!LN->isSimple())
    return SDValue();

  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
  return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
                                 LN->getPointerInfo(), LN->getOriginalAlign(),
                                 LN->getMemOperand()->getFlags());
}

// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction. if (Mask[0] == 0 && (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) { if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) || (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) { Shuffle = X86ISD::VZEXT_MOVL; if (MaskEltSize == 16) SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); else SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } } // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). 
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize && DAG.ComputeNumSignBits(V1) == MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool MatchAny = true; bool MatchZero = true; bool MatchSign = UseSign; unsigned NumDstElts = NumMaskElts / Scale; for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) { if (!isUndefOrEqual(Mask[i * Scale], (int)i)) { MatchAny = MatchSign = MatchZero = false; break; } unsigned Pos = (i * Scale) + 1; unsigned Len = Scale - 1; MatchAny &= isUndefInRange(Mask, Pos, Len); MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len); MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len); } if (MatchAny || MatchSign || MatchZero) { assert((MatchSign || MatchZero) && "Failed to match sext/zext but matched aext?"); unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); Shuffle = unsigned( MatchAny ? ISD::ANY_EXTEND : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND)); if (SrcVT.getVectorNumElements() != NumDstElts) Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); return true; } } } // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) || (MaskEltSize == 16 && Subtarget.hasFP16())) && isUndefOrEqual(Mask[0], 0) && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; if (MaskEltSize == 16) SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); else SrcVT = DstVT = !Subtarget.hasSSE2() ? 
MVT::v4f32 : MaskVT; return true; } // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; } } if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; } } if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG, V1)) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; } } return false; } // Attempt to match a combined 
shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); bool ContainsZeros = isAnyZero(Mask); // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. if (!ContainsZeros && MaskScalarSizeInBits == 64) { // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); PermuteImm = getV4X86ShuffleImm(RepeatedMask); return true; } } } else if (AllowFloatDomain && Subtarget.hasAVX()) { // VPERMILPD can permute with a non-repeating shuffle. Shuffle = X86ISD::VPERMILPI; ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); PermuteImm = 0; for (int i = 0, e = Mask.size(); i != e; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); PermuteImm |= (M & 1) << i; } return true; } } // We are checking for shuffle match or shift match. Loop twice so we can // order which we try and match first depending on target preference. 
for (unsigned Order = 0; Order < 2; ++Order) { if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) { // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { // Narrow the repeated mask to create 32-bit element permutes. SmallVector WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) narrowShuffleMaskElts(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; } } // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef LoMask(RepeatedMask.data() + 0, 4); ArrayRef HiMask(RepeatedMask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { Shuffle = X86ISD::PSHUFLW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(LoMask); return true; } // PSHUFHW: permute upper 4 elements only. if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { // Offset the HiMask so that we can create the shuffle immediate. int OffsetHiMask[4]; for (int i = 0; i != 4; ++i) OffsetHiMask[i] = (HiMask[i] < 0 ? 
HiMask[i] : HiMask[i] - 4); Shuffle = X86ISD::PSHUFHW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } } } } else { // Attempt to match against bit rotates. if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 && ((MaskVT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512())) { int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits, Subtarget, Mask); if (0 < RotateAmt) { Shuffle = X86ISD::VROTLI; PermuteImm = (unsigned)RotateAmt; return true; } } } // Attempt to match against byte/bit shifts. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() || 32 <= ShuffleVT.getScalarSizeInBits())) { // Byte shifts can be slower so only match them on second attempt. if (Order == 0 && (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ)) continue; PermuteImm = (unsigned)ShiftAmt; return true; } } } return false; } // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); unsigned SizeInBits = MaskVT.getSizeInBits(); if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) && AllowFloatDomain) { V2 = V1; V1 = (SM_SentinelUndef == Mask[0] ? 
DAG.getUNDEF(MVT::v4f32) : V1); Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) && AllowFloatDomain) { V2 = V1; Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) && Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}, DAG) && Subtarget.hasFP16()) { Shuffle = X86ISD::MOVSH; SrcVT = DstVT = MVT::v8f16; return true; } } // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle. if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) || ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) || ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) { if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, Subtarget)) { DstVT = MaskVT; return true; } } // TODO: Can we handle this inside matchShuffleWithPACK? if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() && isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) && V1.getScalarValueSizeInBits() == 64 && V2.getScalarValueSizeInBits() == 64) { // Use (SSE41) PACKUSWD if the leading zerobits goto the lowest 16-bits. 
unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros(); unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros(); if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) { SrcVT = MVT::v4i32; DstVT = MVT::v8i16; Shuffle = X86ISD::PACKUS; return true; } // Use PACKUSBW if the leading zerobits goto the lowest 8-bits. if (MinLZV1 >= 56 && MinLZV2 >= 56) { SrcVT = MVT::v8i16; DstVT = MVT::v16i8; Shuffle = X86ISD::PACKUS; return true; } // Use PACKSSWD if the signbits extend to the lowest 16-bits. if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) { SrcVT = MVT::v4i32; DstVT = MVT::v8i16; Shuffle = X86ISD::PACKSS; return true; } } // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512() && (32 <= EltSizeInBits || Subtarget.hasBWI()))) { if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, Subtarget)) { SrcVT = DstVT = MaskVT; if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); return true; } } // Attempt to match against a OR if we're performing a blend shuffle and the // non-blended source element is zero in each case. // TODO: Handle cases where V1/V2 sizes doesn't match SizeInBits. 
if (SizeInBits == V1.getValueSizeInBits() && SizeInBits == V2.getValueSizeInBits() && (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) { bool IsBlend = true; unsigned NumV1Elts = V1.getValueType().getVectorNumElements(); unsigned NumV2Elts = V2.getValueType().getVectorNumElements(); unsigned Scale1 = NumV1Elts / NumMaskElts; unsigned Scale2 = NumV2Elts / NumMaskElts; APInt DemandedZeroV1 = APInt::getZero(NumV1Elts); APInt DemandedZeroV2 = APInt::getZero(NumV2Elts); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; if (M == SM_SentinelZero) { DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); continue; } if (M == (int)i) { DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); continue; } if (M == (int)(i + NumMaskElts)) { DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); continue; } IsBlend = false; break; } if (IsBlend) { if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) && DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) { Shuffle = ISD::OR; SrcVT = DstVT = MaskVT.changeTypeToInteger(); return true; } if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) { // FIXME: handle mismatched sizes? // TODO: investigate if `ISD::OR` handling in // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead. 
auto computeKnownBitsElementWise = [&DAG](SDValue V) { unsigned NumElts = V.getValueType().getVectorNumElements(); KnownBits Known(NumElts); for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) { APInt Mask = APInt::getOneBitSet(NumElts, EltIdx); KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask); if (PeepholeKnown.isZero()) Known.Zero.setBit(EltIdx); if (PeepholeKnown.isAllOnes()) Known.One.setBit(EltIdx); } return Known; }; KnownBits V1Known = computeKnownBitsElementWise(V1); KnownBits V2Known = computeKnownBitsElementWise(V2); for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; if (M == SM_SentinelZero) { IsBlend &= V1Known.Zero[i] && V2Known.Zero[i]; continue; } if (M == (int)i) { IsBlend &= V2Known.Zero[i] || V1Known.One[i]; continue; } if (M == (int)(i + NumMaskElts)) { IsBlend &= V1Known.Zero[i] || V2Known.One[i]; continue; } llvm_unreachable("will not get here."); } if (IsBlend) { Shuffle = ISD::OR; SrcVT = DstVT = MaskVT.changeTypeToInteger(); return true; } } } } return false; } static bool matchBinaryPermuteShuffle( MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against VALIGND/VALIGNQ rotate. 
if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) && ((MaskVT.is128BitVector() && Subtarget.hasVLX()) || (MaskVT.is256BitVector() && Subtarget.hasVLX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (!isAnyZero(Mask)) { int Rotation = matchShuffleAsElementRotate(V1, V2, Mask); if (0 < Rotation) { Shuffle = X86ISD::VALIGN; if (EltSizeInBits == 64) ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64); else ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32); PermuteImm = Rotation; return true; } } } // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); PermuteImm = ByteRotation; return true; } } // Attempt to combine to X86ISD::BLENDI. if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector TargetMask(Mask); if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero, ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, RepeatedMask)) { assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); PermuteImm = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) PermuteImm |= 1 << i; V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? 
getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::BLENDI; ShuffleVT = MaskVT; return true; } } else { V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; ShuffleVT = MaskVT; return true; } } } // Attempt to combine to INSERTPS, but only if it has elements that need to // be set to zero. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector() && isAnyZero(Mask) && matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; } // Attempt to combine to SHUFPD. if (AllowFloatDomain && EltSizeInBits == 64 && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { bool ForceV1Zero = false, ForceV2Zero = false; if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero, PermuteImm, Mask, Zeroable)) { V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. if (AllowFloatDomain && EltSizeInBits == 32 && ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { // Match each half of the repeated mask, to determine if its just // referencing one of the vectors, is zeroable or entirely undef. 
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; if (isUndefInRange(RepeatedMask, Offset, 2)) { return DAG.getUNDEF(MaskVT); } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { S0 = (SM_SentinelUndef == M0 ? -1 : 0); S1 = (SM_SentinelUndef == M1 ? -1 : 1); return getZeroVector(MaskVT, Subtarget, DAG, DL); } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V1; } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V2; } return SDValue(); }; int ShufMask[4] = {-1, -1, -1, -1}; SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); if (Lo && Hi) { V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } } } // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector() && matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; } return false; } static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget); /// Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// /// This is the leaf of the recursive combine below. 
When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); SDLoc DL(Root); MVT RootVT = Root.getSimpleValueType(); unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); // Canonicalize shuffle input op to the requested type. auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) { if (VT.getSizeInBits() > Op.getValueSizeInBits()) Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits()); else if (VT.getSizeInBits() < Op.getValueSizeInBits()) Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits()); return DAG.getBitcast(VT, Op); }; // Find the inputs that enter the chain. Note that multiple uses are OK // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); SDValue V2 = (UnaryShuffle ? 
DAG.getUNDEF(V1.getValueType()) : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); assert((RootSizeInBits % VT1.getSizeInBits()) == 0 && (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch"); SDValue Res; unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); return CanonicalizeShuffleInput(RootVT, V1); } bool OptForSize = DAG.shouldOptForSize(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || (RootVT.isFloatingPoint() && Depth >= 1) || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks // from being reused. bool IsMaskedShuffle = false; if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) { if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT && Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) { IsMaskedShuffle = true; } } // If we are shuffling a splat (and not introducing zeros) then we can just // use it directly. This works for smaller elements as well as they already // repeat across each mask element. if (UnaryShuffle && !isAnyZero(BaseMask) && V1.getValueSizeInBits() >= RootSizeInBits && (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && DAG.isSplatValue(V1, /*AllowUndefs*/ false)) { return CanonicalizeShuffleInput(RootVT, V1); } SmallVector Mask(BaseMask); // See if the shuffle is a hidden identity shuffle - repeated args in HOPs // etc. can be simplified. 
if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) { SmallVector ScaledMask, IdentityMask; unsigned NumElts = VT1.getVectorNumElements(); if (Mask.size() <= NumElts && scaleShuffleElements(Mask, NumElts, ScaledMask)) { for (unsigned i = 0; i != NumElts; ++i) IdentityMask.push_back(i); if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1, V2)) return CanonicalizeShuffleInput(RootVT, V1); } } // Handle 128/256-bit lane shuffles of 512-bit vectors. if (RootVT.is512BitVector() && (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) { // If the upper subvectors are zeroable, then an extract+insert is more // optimal than using X86ISD::SHUF128. The insertion is free, even if it has // to zero the upper subvectors. if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) { if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) return SDValue(); // Nothing to do! assert(isInRange(Mask[0], 0, NumBaseMaskElts) && "Unexpected lane shuffle"); Res = CanonicalizeShuffleInput(RootVT, V1); unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts); bool UseZero = isAnyZero(Mask); Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits); return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits); } // Narrow shuffle mask to v4x128. SmallVector ScaledMask; assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size"); narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask); // Try to lower to vshuf64x2/vshuf32x4. auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, ArrayRef ScaledMask, SDValue V1, SDValue V2, SelectionDAG &DAG) { int PermMask[4] = {-1, -1, -1, -1}; // Ensure elements came from the same Op. SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)}; for (int i = 0; i < 4; ++i) { assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value"); if (ScaledMask[i] < 0) continue; SDValue Op = ScaledMask[i] >= 4 ? 
V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; else if (Ops[OpIndex] != Op) return SDValue(); PermMask[i] = ScaledMask[i] % 4; } return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, CanonicalizeShuffleInput(ShuffleVT, Ops[0]), CanonicalizeShuffleInput(ShuffleVT, Ops[1]), getV4X86ShuffleImm8ForMask(PermMask, DL, DAG)); }; // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask // doesn't work because our mask is for 128 bits and we don't have an MVT // to match that. bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) && isUndefOrInRange(ScaledMask[1], 0, 2) && isUndefOrInRange(ScaledMask[2], 2, 4) && isUndefOrInRange(ScaledMask[3], 2, 4) && (ScaledMask[0] < 0 || ScaledMask[2] < 0 || ScaledMask[0] == (ScaledMask[2] % 2)) && (ScaledMask[1] < 0 || ScaledMask[3] < 0 || ScaledMask[1] == (ScaledMask[3] % 2)); if (!isAnyZero(ScaledMask) && !PreferPERMQ) { if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG)) return DAG.getBitcast(RootVT, V); } } // Handle 128-bit lane shuffles of 256-bit vectors. if (RootVT.is256BitVector() && NumBaseMaskElts == 2) { // If the upper half is zeroable, then an extract+insert is more optimal // than using X86ISD::VPERM2X128. The insertion is free, even if it has to // zero the upper half. if (isUndefOrZero(Mask[1])) { if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) return SDValue(); // Nothing to do! assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle"); Res = CanonicalizeShuffleInput(RootVT, V1); Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL); return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL, 256); } // If we're inserting the low subvector, an insert-subvector 'concat' // pattern is quicker than VPERM2X128. 
// TODO: Add AVX2 support instead of VPERMQ/VPERMPD. if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) && !Subtarget.hasAVX2()) { if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) return SDValue(); // Nothing to do! SDValue Lo = CanonicalizeShuffleInput(RootVT, V1); SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2); Hi = extractSubVector(Hi, 0, DAG, DL, 128); return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128); } if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless // we need to use the zeroing feature. // Prefer blends for sequential shuffles unless we are optimizing for size. if (UnaryShuffle && !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) && (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) { unsigned PermMask = 0; PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0); PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4); return DAG.getNode( X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1), DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8)); } if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) return SDValue(); // Nothing to do! // TODO - handle AVX512VL cases with X86ISD::SHUF128. if (!UnaryShuffle && !IsMaskedShuffle) { assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) && "Unexpected shuffle sentinel value"); // Prefer blends to X86ISD::VPERM2X128. if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) { unsigned PermMask = 0; PermMask |= ((Mask[0] & 3) << 0); PermMask |= ((Mask[1] & 3) << 4); SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2; SDValue RHS = isInRange(Mask[1], 0, 2) ? 
V1 : V2; return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, LHS), CanonicalizeShuffleInput(RootVT, RHS), DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } // For masks that have been widened to 128-bit elements or more, // narrow back down to 64-bit elements. if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; SmallVector ScaledMask; narrowShuffleMaskElts(MaskScale, Mask, ScaledMask); Mask = std::move(ScaledMask); } // For masked shuffles, we're trying to match the root width for better // writemask folding, attempt to scale the mask. // TODO - variable shuffles might need this to be widened again. if (IsMaskedShuffle && NumRootElts > Mask.size()) { assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size"); int MaskScale = NumRootElts / Mask.size(); SmallVector ScaledMask; narrowShuffleMaskElts(MaskScale, Mask, ScaledMask); Mask = std::move(ScaledMask); } unsigned NumMaskElts = Mask.size(); unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Determine the effective mask value type. FloatDomain &= (32 <= MaskEltSizeInBits); MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) : MVT::getIntegerVT(MaskEltSizeInBits); MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); // Only allow legal mask types. if (!TLI.isTypeLegal(MaskVT)) return SDValue(); // Attempt to match the mask against known shuffle patterns. MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. // TODO: Should we indicate which domain is preferred if both are allowed? bool AllowFloatDomain = FloatDomain || (Depth >= 3); bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. 
APInt KnownUndef, KnownZero; resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); APInt Zeroable = KnownUndef | KnownZero; if (UnaryShuffle) { // Attempt to match against broadcast-from-vector. // Limit AVX1 to cases where we're loading+broadcasting a scalar element. if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) && (!IsMaskedShuffle || NumRootElts == NumMaskElts)) { if (isUndefOrEqual(Mask, 0)) { if (V1.getValueType() == MaskVT && V1.getOpcode() == ISD::SCALAR_TO_VECTOR && X86::mayFoldLoad(V1.getOperand(0), Subtarget)) { if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! Res = V1.getOperand(0); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } if (Subtarget.hasAVX2()) { if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! Res = CanonicalizeShuffleInput(MaskVT, V1); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } } } if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); return DAG.getBitcast(RootVT, Res); } if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! 
Res = CanonicalizeShuffleInput(ShuffleVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } // Attempt to combine to INSERTPS, but only if the inserted element has come // from a scalar. // TODO: Handle other insertions here as well? if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 && Subtarget.hasSSE41() && !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) { if (MaskEltSizeInBits == 32) { SDValue SrcV1 = V1, SrcV2 = V2; if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) && SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) return SDValue(); // Nothing to do! Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, CanonicalizeShuffleInput(MVT::v4f32, SrcV1), CanonicalizeShuffleInput(MVT::v4f32, SrcV2), DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } if (MaskEltSizeInBits == 64 && isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) && V2.getOpcode() == ISD::SCALAR_TO_VECTOR && V2.getScalarValueSizeInBits() <= 32) { if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) return SDValue(); // Nothing to do! PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0); Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, CanonicalizeShuffleInput(MVT::v4f32, V1), CanonicalizeShuffleInput(MVT::v4f32, V2), DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } SDValue NewV1 = V1; // Save operands in case early exit happens. SDValue NewV2 = V2; if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! 
NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1); NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2); return DAG.getBitcast(RootVT, Res); } NewV1 = V1; // Save operands in case early exit happens. NewV2 = V2; if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1); NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // Typically from here on, we need an integer version of MaskVT. MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = CanonicalizeShuffleInput(IntMaskVT, V1); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! 
V1 = CanonicalizeShuffleInput(IntMaskVT, V1); V2 = CanonicalizeShuffleInput(IntMaskVT, V2); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } // Match shuffle against TRUNCATE patterns. if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) { // Match against a VTRUNC instruction, accounting for src/dst sizes. if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable, Subtarget)) { bool IsTRUNCATE = ShuffleVT.getVectorNumElements() == ShuffleSrcVT.getVectorNumElements(); unsigned Opc = IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC; if (Depth == 0 && Root.getOpcode() == Opc) return SDValue(); // Nothing to do! V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); Res = DAG.getNode(Opc, DL, ShuffleVT, V1); if (ShuffleVT.getSizeInBits() < RootSizeInBits) Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits); return DAG.getBitcast(RootVT, Res); } // Do we need a more general binary truncation pattern? if (RootSizeInBits < 512 && ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) || (RootVT.is128BitVector() && Subtarget.hasVLX())) && (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { // Bail if this was already a truncation or PACK node. // We sometimes fail to match PACK if we demand known undef elements. if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE || Root.getOpcode() == X86ISD::PACKSS || Root.getOpcode() == X86ISD::PACKUS)) return SDValue(); // Nothing to do! 
ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2); ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts); Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2); Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 1) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. int VariableCrossLaneShuffleDepth = Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2; int VariablePerLaneShuffleDepth = Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2; AllowVariableCrossLaneMask &= (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask; AllowVariablePerLaneMask &= (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask; // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a // higher depth before combining them. bool AllowBWIVPERMV3 = (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask); bool MaskContainsZeros = isAnyZero(Mask); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) { if (Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); Res = CanonicalizeShuffleInput(MaskVT, V1); Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); return DAG.getBitcast(RootVT, Res); } // AVX512 variants (non-VLX will pad to 512-bit shuffles). 
if ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || (Subtarget.hasVBMI() && (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) { V1 = CanonicalizeShuffleInput(MaskVT, V1); V2 = DAG.getUNDEF(MaskVT); Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); return DAG.getBitcast(RootVT, Res); } } // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero // vector as the second source (non-VLX will pad to 512-bit shuffles). if (UnaryShuffle && AllowVariableCrossLaneMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && AllowBWIVPERMV3 && (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || (Subtarget.hasVBMI() && AllowBWIVPERMV3 && (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { // Adjust shuffle mask - replace SM_SentinelZero with second source index. for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; V1 = CanonicalizeShuffleInput(MaskVT, V1); V2 = getZeroVector(MaskVT, Subtarget, DAG, DL); Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); return DAG.getBitcast(RootVT, Res); } // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input lane-crossing shuffle then lower to VPERMV3, // (non-VLX will pad to 512-bit shuffles). 
if (AllowVariableCrossLaneMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && AllowBWIVPERMV3 && (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || (Subtarget.hasVBMI() && AllowBWIVPERMV3 && (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { V1 = CanonicalizeShuffleInput(MaskVT, V1); V2 = CanonicalizeShuffleInput(MaskVT, V2); Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); return DAG.getBitcast(RootVT, Res); } return SDValue(); } // See if we can combine a single input shuffle with zeros to a bit-mask, // which is much simpler than any shuffle. if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask && isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && TLI.isTypeLegal(MaskVT)) { APInt Zero = APInt::getZero(MaskEltSizeInBits); APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits); APInt UndefElts(NumMaskElts, 0); SmallVector EltBits(NumMaskElts, Zero); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { UndefElts.setBit(i); continue; } if (M == SM_SentinelZero) continue; EltBits[i] = AllOnes; } SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); Res = CanonicalizeShuffleInput(MaskVT, V1); unsigned AndOpcode = MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); return DAG.getBitcast(RootVT, Res); } // If we have a single input shuffle with different shuffle patterns in the // the 128-bit lanes use the variable mask to VPERMILPS. // TODO Combine other mask types at higher depths. 
if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros && ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { SmallVector VPermIdx; for (int M : Mask) { SDValue Idx = M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); VPermIdx.push_back(Idx); } SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); Res = CanonicalizeShuffleInput(MaskVT, V1); Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); return DAG.getBitcast(RootVT, Res); } // With XOP, binary shuffles of 128/256-bit floating point vectors can combine // to VPERMIL2PD/VPERMIL2PS. if (AllowVariablePerLaneMask && Subtarget.hasXOP() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v8f32)) { // VPERMIL2 Operation. // Bits[3] - Match Bit. // Bits[2:1] - (Per Lane) PD Shuffle Mask. // Bits[2:0] - (Per Lane) PS Shuffle Mask. unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector VPerm2Idx; unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { VPerm2Idx.push_back(-1); continue; } if (M == SM_SentinelZero) { M2ZImm = 2; VPerm2Idx.push_back(8); continue; } int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); VPerm2Idx.push_back(Index); } V1 = CanonicalizeShuffleInput(MaskVT, V1); V2 = CanonicalizeShuffleInput(MaskVT, V2); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getTargetConstant(M2ZImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. 
// Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. if (UnaryShuffle && AllowVariablePerLaneMask && ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || (RootVT.is256BitVector() && Subtarget.hasAVX2()) || (RootVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector PSHUFBMask; int NumBytes = RootVT.getSizeInBits() / 8; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; assert((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); Res = CanonicalizeShuffleInput(ByteVT, V1); SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); return DAG.getBitcast(RootVT, Res); } // With XOP, if we have a 128-bit binary input shuffle we can always combine // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never // slower than PSHUFB on targets that support both. 
if (AllowVariablePerLaneMask && RootVT.is128BitVector() && Subtarget.hasXOP()) { // VPPERM Mask Operation // Bits[4:0] - Byte Index (0 - 31) // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) SmallVector VPPERMMask; int NumBytes = 16; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::v16i8; V1 = CanonicalizeShuffleInput(ByteVT, V1); V2 = CanonicalizeShuffleInput(ByteVT, V2); SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); return DAG.getBitcast(RootVT, Res); } // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. 
if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input shuffle then lower to VPERMV3, // (non-VLX will pad to 512-bit shuffles) if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && AllowBWIVPERMV3 && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || (Subtarget.hasVBMI() && AllowBWIVPERMV3 && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { V1 = CanonicalizeShuffleInput(MaskVT, V1); V2 = CanonicalizeShuffleInput(MaskVT, V2); Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); return DAG.getBitcast(RootVT, Res); } // Failed to find any combines. return SDValue(); } // Combine an arbitrary chain of shuffles + extract_subvectors into a single // instruction if possible. 
//
// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
// type size to attempt to combine:
// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
// -->
// extract_subvector(shuffle(x,y,m2),0)
//
// Returns an empty SDValue when:
//  - there are no inputs,
//  - no legal source wider than the root is found (or the widest size is not
//    a multiple of the root size),
//  - no non-zero extraction index was peeked through,
//  - more than two wide inputs remain after resolving the inputs, or
//  - the wide combineX86ShuffleChain call itself fails.
// Otherwise returns the lowest RootSizeInBits bits of the wide shuffle, bitcast
// to the root's value type.
//
// NOTE(review): several template argument lists (on ArrayRef / SmallVector /
// MutableArrayRef) look to have been lost in this copy of the file - confirm
// against the canonical source before building.
static SDValue combineX86ShuffleChainWithExtract(
    ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth,
    bool HasVariableMask, bool AllowVariableCrossLaneMask,
    bool AllowVariablePerLaneMask, SelectionDAG &DAG,
    const X86Subtarget &Subtarget) {
  unsigned NumMaskElts = BaseMask.size();
  unsigned NumInputs = Inputs.size();
  if (NumInputs == 0)
    return SDValue();

  EVT RootVT = Root.getValueType();
  unsigned RootSizeInBits = RootVT.getSizeInBits();
  // Width in bits of a single element of BaseMask.
  unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
  assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");

  // Peek through extract_subvector to find widest legal vector.
  // TODO: Handle ISD::TRUNCATE
  unsigned WideSizeInBits = RootSizeInBits;
  for (unsigned I = 0; I != NumInputs; ++I) {
    SDValue Input = peekThroughBitcasts(Inputs[I]);
    while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
      Input = peekThroughBitcasts(Input.getOperand(0));
    if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
        WideSizeInBits < Input.getValueSizeInBits())
      WideSizeInBits = Input.getValueSizeInBits();
  }

  // Bail if we fail to find a source larger than the existing root.
  unsigned Scale = WideSizeInBits / RootSizeInBits;
  if (WideSizeInBits <= RootSizeInBits ||
      (WideSizeInBits % RootSizeInBits) != 0)
    return SDValue();

  // Create new mask for larger type.
  // Each non-sentinel index keeps its offset within its input
  // (M % NumMaskElts) while the per-input stride grows from NumMaskElts to
  // Scale * NumMaskElts. The extra (Scale - 1) * NumMaskElts result elements
  // are marked undef.
  SmallVector WideMask(BaseMask);
  for (int &M : WideMask) {
    if (M < 0)
      continue;
    M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
  }
  WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);

  // Attempt to peek through inputs and adjust mask when we extract from an
  // upper subvector.
  // AdjustedMasks counts how many non-zero extraction indices were folded into
  // the mask; it is later used both as a bail-out condition and as a depth
  // increment.
  int AdjustedMasks = 0;
  SmallVector WideInputs(Inputs.begin(), Inputs.end());
  for (unsigned I = 0; I != NumInputs; ++I) {
    SDValue &Input = WideInputs[I];
    Input = peekThroughBitcasts(Input);
    // Only peek through extractions whose source is no wider than the chosen
    // wide size.
    while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
           Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
      uint64_t Idx = Input.getConstantOperandVal(1);
      if (Idx != 0) {
        ++AdjustedMasks;
        unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
        // Rescale the extraction index from input-element units to
        // root-mask-element units.
        Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;

        // Only shift the mask entries that refer to input I, i.e. those in
        // the half-open range [lo, hi).
        int lo = I * WideMask.size();
        int hi = (I + 1) * WideMask.size();
        for (int &M : WideMask)
          if (lo <= M && M < hi)
            M += Idx;
      }
      Input = peekThroughBitcasts(Input.getOperand(0));
    }
  }

  // Remove unused/repeated shuffle source ops.
  resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
  assert(!WideInputs.empty() && "Shuffle with no inputs detected");

  // Bail if we're always extracting from the lowest subvectors,
  // combineX86ShuffleChain should match this for the current width, or the
  // shuffle still references too many inputs.
  if (AdjustedMasks == 0 || WideInputs.size() > 2)
    return SDValue();

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
  while (WideMask.size() > 1) {
    SmallVector WidenedMask;
    if (!canWidenShuffleElements(WideMask, WidenedMask))
      break;
    WideMask = std::move(WidenedMask);
  }

  // Canonicalization of binary shuffle masks to improve pattern matching by
  // commuting the inputs.
  if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
    ShuffleVectorSDNode::commuteMask(WideMask);
    std::swap(WideInputs[0], WideInputs[1]);
  }

  // Increase depth for every upper subvector we've peeked through.
  Depth += AdjustedMasks;

  // Attempt to combine wider chain.
  // TODO: Can we use a better Root?
  // Use the first wide input as the root only if it is strictly larger than
  // the last one; otherwise use the last.
  SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
                             WideInputs.back().getValueSizeInBits()
                         ? WideInputs.front()
                         : WideInputs.back();
  assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
         "WideRootSize mismatch");

  if (SDValue WideShuffle =
          combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
                                 HasVariableMask, AllowVariableCrossLaneMask,
                                 AllowVariablePerLaneMask, DAG, Subtarget)) {
    // Extract the lowest RootSizeInBits of the wide result and restore the
    // original root type.
    WideShuffle =
        extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
    return DAG.getBitcast(RootVT, WideShuffle);
  }

  return SDValue();
}

// Canonicalize the combined shuffle mask chain with horizontal ops.
// NOTE: This may update the Ops and Mask.
//
// All ops (after peeking through bitcasts) must share the same opcode and value
// type, be RootSizeInBits wide, and be one of FHADD/HADD/FHSUB/HSUB or
// PACKSS/PACKUS; otherwise an empty SDValue is returned immediately.
// Returns a replacement node when one of the folds below applies, or an empty
// SDValue otherwise. Ops and Mask may already have been commuted or
// canonicalized in place even when an empty SDValue is returned.
static SDValue canonicalizeShuffleMaskWithHorizOp(
    MutableArrayRef Ops, MutableArrayRef Mask, unsigned RootSizeInBits,
    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  if (Mask.empty() || Ops.empty())
    return SDValue();

  // BC[i] is Ops[i] with any bitcasts peeked through.
  SmallVector BC;
  for (SDValue Op : Ops)
    BC.push_back(peekThroughBitcasts(Op));

  // All ops must be the same horizop + type.
  SDValue BC0 = BC[0];
  EVT VT0 = BC0.getValueType();
  unsigned Opcode0 = BC0.getOpcode();
  if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
        return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
      }))
    return SDValue();

  bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
                  Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
  bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
  if (!isHoriz && !isPack)
    return SDValue();

  // Do all ops have a single use?
  // (Including every bitcast between the op and the underlying node.)
  bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
    return Op.hasOneUse() &&
           peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
  });

  // Element counts of the horizontal op's result type, per vector, per 128-bit
  // lane and per half lane.
  int NumElts = VT0.getVectorNumElements();
  int NumLanes = VT0.getSizeInBits() / 128;
  int NumEltsPerLane = NumElts / NumLanes;
  int NumHalfEltsPerLane = NumEltsPerLane / 2;
  // Type of the horizontal op's own operands.
  MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
  // Width in bits of a single element of Mask.
  unsigned EltSizeInBits = RootSizeInBits / Mask.size();

  if (NumEltsPerLane >= 4 &&
      (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
    // ScaledMask is indexed below as four entries describing the shuffle
    // repeated in every 128-bit lane.
    SmallVector LaneMask, ScaledMask;
    if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
        scaleShuffleElements(LaneMask, 4, ScaledMask)) {
      // See if we can remove the shuffle by resorting the HOP chain so that
      // the HOP args are pre-shuffled.
      // TODO: Generalize to any sized/depth chain.
      // TODO: Add support for PACKSS/PACKUS.
      if (isHoriz) {
        // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
        // M / 4 selects the outer op, (M % 4) >= 2 selects which of its two
        // operands to look at, and M % 2 selects the operand of that inner
        // hop. Returns an empty SDValue if the inner node is not the same
        // horizontal op or has users other than the outer op.
        auto GetHOpSrc = [&](int M) {
          if (M == SM_SentinelUndef)
            return DAG.getUNDEF(VT0);
          if (M == SM_SentinelZero)
            return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
          SDValue Src0 = BC[M / 4];
          SDValue Src1 = Src0.getOperand((M % 4) >= 2);
          if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
            return Src1.getOperand(M % 2);
          return SDValue();
        };
        SDValue M0 = GetHOpSrc(ScaledMask[0]);
        SDValue M1 = GetHOpSrc(ScaledMask[1]);
        SDValue M2 = GetHOpSrc(ScaledMask[2]);
        SDValue M3 = GetHOpSrc(ScaledMask[3]);
        if (M0 && M1 && M2 && M3) {
          SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
          SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
          return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
        }
      }
      // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
      if (Ops.size() >= 2) {
        SDValue LHS, RHS;
        // Records at most two distinct hop operands in LHS/RHS (captured by
        // reference) and writes the post-permute index to OutM: 0/1 when the
        // source is LHS, 2/3 when it is RHS. Returns false if a third distinct
        // source would be needed, or for any negative M other than undef.
        auto GetHOpSrc = [&](int M, int &OutM) {
          // TODO: Support SM_SentinelZero
          if (M < 0)
            return M == SM_SentinelUndef;
          SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
          if (!LHS || LHS == Src) {
            LHS = Src;
            OutM = (M % 2);
            return true;
          }
          if (!RHS || RHS == Src) {
            RHS = Src;
            OutM = (M % 2) + 2;
            return true;
          }
          return false;
        };
        int PostMask[4] = {-1, -1, -1, -1};
        // The calls are chained with && so they run in mask order and stop at
        // the first failure; LHS/RHS are therefore assigned in mask order.
        if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
            GetHOpSrc(ScaledMask[1], PostMask[1]) &&
            GetHOpSrc(ScaledMask[2], PostMask[2]) &&
            GetHOpSrc(ScaledMask[3], PostMask[3])) {
          LHS = DAG.getBitcast(SrcVT, LHS);
          // If only one source was recorded, use it for both operands.
          RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
          SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
          // Use SHUFPS for the permute so this will work on SSE2 targets,
          // shuffle combining and domain handling will simplify this later on.
          MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
          Res = DAG.getBitcast(ShuffleVT, Res);
          return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
                             getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
        }
      }
    }
  }

  // The remaining folds only handle unary and binary shuffles.
  if (2 < Ops.size())
    return SDValue();

  // Last op; this is the same node as BC0 when there is only one op.
  SDValue BC1 = BC[BC.size() - 1];
  if (Mask.size() == VT0.getVectorNumElements()) {
    // Canonicalize binary shuffles of horizontal ops that use the
    // same sources to an unary shuffle.
    // TODO: Try to perform this fold even if the shuffle remains.
    if (Ops.size() == 2) {
      // True if Op is one of HOp's two operands.
      auto ContainsOps = [](SDValue HOp, SDValue Op) {
        return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
      };
      // Commute if all BC0's ops are contained in BC1.
      if (ContainsOps(BC1, BC0.getOperand(0)) &&
          ContainsOps(BC1, BC0.getOperand(1))) {
        ShuffleVectorSDNode::commuteMask(Mask);
        std::swap(Ops[0], Ops[1]);
        std::swap(BC0, BC1);
      }

      // If BC1 can be represented by BC0, then convert to unary shuffle.
      if (ContainsOps(BC0, BC1.getOperand(0)) &&
          ContainsOps(BC0, BC1.getOperand(1))) {
        for (int &M : Mask) {
          if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
            continue;
          // SubLane is 0 when M falls in the lower half of its 128-bit lane
          // and 1 when it falls in the upper half; it selects which BC1
          // operand the element came from.
          int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
          // Rebase the index onto BC0's lower half lane, then move it to the
          // upper half lane if that BC1 operand is not BC0's first operand.
          M -= NumElts + (SubLane * NumHalfEltsPerLane);
          if (BC1.getOperand(SubLane) != BC0.getOperand(0))
            M += NumHalfEltsPerLane;
        }
      }
    }

    // Canonicalize unary horizontal ops to only refer to lower halves.
    // When a hop has identical operands, an index into the upper half of a
    // lane is rewritten to the matching index in the lower half.
    for (int i = 0; i != NumElts; ++i) {
      int &M = Mask[i];
      if (isUndefOrZero(M))
        continue;
      if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
          (M % NumEltsPerLane) >= NumHalfEltsPerLane)
        M -= NumHalfEltsPerLane;
      if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
          (M % NumEltsPerLane) >= NumHalfEltsPerLane)
        M -= NumHalfEltsPerLane;
    }
  }

  // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
  // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
  // represents the LHS/RHS inputs for the lower/upper halves.
  SmallVector TargetMask128, WideMask128;
  if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
      scaleShuffleElements(TargetMask128, 2, WideMask128)) {
    assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
    bool SingleOp = (Ops.size() == 1);
    if (isPack || OneUseOps ||
        shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
      // Indices 0-1 pick from BC0 and anything else picks from BC1; the low
      // bit of the index then selects which operand of that hop to use.
      SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
      SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
      Lo = Lo.getOperand(WideMask128[0] & 1);
      Hi = Hi.getOperand(WideMask128[1] & 1);
      if (SingleOp) {
        // Replace the operands selected above with explicit zero/undef
        // vectors where the mask holds a sentinel.
        SDValue Undef = DAG.getUNDEF(SrcVT);
        SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
        Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
        Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
        Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
        Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
      }
      return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
    }
  }

  // If we are post-shuffling a 256-bit hop and not requiring the upper
  // elements, then try to narrow to a 128-bit hop directly.
  SmallVector WideMask64;
  if (Ops.size() == 1 && NumLanes == 2 &&
      scaleShuffleElements(Mask, 4, WideMask64) &&
      isUndefInRange(WideMask64, 2, 2)) {
    int M0 = WideMask64[0];
    int M1 = WideMask64[1];
    if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
      MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
      // Bit 0 of each index selects the hop operand; bit 1 selects the lower
      // (element 0) or upper (element NumElts / 2) 128 bits of that operand.
      unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
      unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
      SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
      SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
      SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
      return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
    }
  }

  return SDValue();
}

// Attempt to constant fold all of the constant source ops.
// Returns the folded value (a zero vector, or a constant vector bitcast to VT)
// if the entire shuffle is folded to a constant, otherwise an empty SDValue.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops,
                                           ArrayRef Mask,
                                           bool HasVariableMask,
                                           SelectionDAG &DAG, const SDLoc &DL,
                                           const X86Subtarget &Subtarget) {
  unsigned SizeInBits = VT.getSizeInBits();
  unsigned NumMaskElts = Mask.size();
  // Width in bits of a single element of Mask.
  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
  unsigned NumOps = Ops.size();

  // Extract constant bits from each source op.
  // Bail as soon as any op fails to yield constant bits.
  // NOTE(review): the element types of these containers appear to have been
  // stripped in this copy of the file - confirm against the canonical source.
  SmallVector UndefEltsOps(NumOps);
  SmallVector, 16> RawBitsOps(NumOps);
  for (unsigned I = 0; I != NumOps; ++I)
    if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
                                       RawBitsOps[I],
                                       /*AllowWholeUndefs*/ true,
                                       /*AllowPartialUndefs*/ true))
      return SDValue();

  // If we're optimizing for size, only fold if at least one of the constants is
  // only used once or the combined shuffle has included a variable mask
  // shuffle, this is to avoid constant pool bloat.
  bool IsOptimizingSize = DAG.shouldOptForSize();
  if (IsOptimizingSize && !HasVariableMask &&
      llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
    return SDValue();

  // Shuffle the constant bits according to the mask.
  // Every result element is classified as exactly one of undef, zero or
  // constant (see the assert after the loop).
  APInt UndefElts(NumMaskElts, 0);
  APInt ZeroElts(NumMaskElts, 0);
  APInt ConstantElts(NumMaskElts, 0);
  SmallVector ConstantBitData(NumMaskElts, APInt::getZero(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(i);
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts.setBit(i);
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

    // Split the mask index into (source op, element within that op).
    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

    // An undef source element stays undef.
    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
    if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts.setBit(i);
      continue;
    }

    // A source element whose bits are all zero is recorded as zero rather
    // than as a constant.
    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(i);
      continue;
    }

    ConstantElts.setBit(i);
    ConstantBitData[i] = Bits;
  }
  assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());

  // Attempt to create a zero vector.
  if ((UndefElts | ZeroElts).isAllOnes())
    return getZeroVector(VT, Subtarget, DAG, DL);

  // Create the constant data.
  // Use a floating point element type only when VT is floating point and the
  // mask elements are 32 or 64 bits wide; otherwise use an integer type.
  MVT MaskSVT;
  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
  else
    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

  // Give up if the resulting vector type is not legal for the target.
  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
    return SDValue();

  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  return DAG.getBitcast(VT, CstOp);
}

namespace llvm {
namespace X86 {
enum { MaxShuffleCombineDepth = 8 };
} // namespace X86
} // namespace llvm

/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions.
Once /// they have been fully optimized, this will recursively consider all chains /// of single-use shuffle instructions, build a generic model of the cumulative /// shuffle operation, and check for simpler instructions which implement this /// operation. We use this primarily for two purposes: /// /// 1) Collapse generic shuffles to specialized single instructions when /// equivalent. In most cases, this is just an encoding size win, but /// sometimes we will collapse multiple generic shuffles into a single /// special-purpose shuffle. /// 2) Look for sequences of shuffle instructions with 3 or more total /// instructions, and replace them with the slightly more expensive SSSE3 /// PSHUFB instruction if available. We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with /// a suitable short sequence of other instructions. The PSHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// /// Because this is inherently a quadratic operation (for each shuffle in /// a chain, we recurse up the chain), the depth is limited to 8 instructions. /// This should never be an issue in practice as the shuffle lowering doesn't /// produce sequences of more than 8 instructions. /// /// FIXME: We will currently miss some cases where the redundant shuffling /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. 
///
/// Parameters:
///  - SrcOps / SrcOpIndex: the current list of shuffle inputs and the index of
///    the one (SrcOps[SrcOpIndex]) that this call tries to look through.
///  - Root: the node being replaced; its type is the type of the result.
///  - RootMask: the mask accumulated so far. Indices in the range
///    [SrcOpIndex * RootMask.size(), (SrcOpIndex + 1) * RootMask.size())
///    refer to SrcOps[SrcOpIndex].
///  - SrcNodes: the shuffle nodes that have already been merged into RootMask.
///  - Depth / MaxDepth: current recursion depth and its limit; nothing is
///    combined once Depth >= MaxDepth.
///  - HasVariableMask: whether a variable-mask shuffle was already merged.
///  - AllowVariableCrossLaneMask / AllowVariablePerLaneMask: forwarded to the
///    shuffle-chain matchers; they are cleared for recursive calls on inputs
///    that have other, not yet combined, users.
///
/// Returns the replacement value for Root, or an empty SDValue if nothing
/// could be combined.
///
/// NOTE(review): several template argument lists in this function (ArrayRef,
/// SmallVector, std::max, llvm::has_single_bit) appear to be missing from this
/// text - confirm against the upstream file before building.
static SDValue combineX86ShufflesRecursively(
    ArrayRef SrcOps, int SrcOpIndex, SDValue Root,
    ArrayRef RootMask, ArrayRef SrcNodes, unsigned Depth,
    unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
    bool AllowVariablePerLaneMask, SelectionDAG &DAG,
    const X86Subtarget &Subtarget) {
  // A single-element root mask is only legal as the identity on operand 0.
  assert(!RootMask.empty() &&
         (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
         "Illegal shuffle root mask");
  MVT RootVT = Root.getSimpleValueType();
  assert(RootVT.isVector() && "Shuffles operate on vector types!");
  unsigned RootSizeInBits = RootVT.getSizeInBits();
  SDLoc DL(Root);

  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth >= MaxDepth)
    return SDValue();

  // Directly rip through bitcasts to find the underlying operand.
  SDValue Op = SrcOps[SrcOpIndex];
  Op = peekThroughOneUseBitcasts(Op);

  EVT VT = Op.getValueType();
  if (!VT.isVector() || !VT.isSimple())
    return SDValue(); // Bail if we hit a non-simple non-vector.

  // FIXME: Just bail on f16 for now.
  if (VT.getVectorElementType() == MVT::f16)
    return SDValue();

  assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
         "Can only combine shuffles upto size of the root op.");

  // Create a demanded elts mask from the referenced elements of Op.
  APInt OpDemandedElts = APInt::getZero(RootMask.size());
  for (int M : RootMask) {
    int BaseIdx = RootMask.size() * SrcOpIndex;
    if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
      OpDemandedElts.setBit(M - BaseIdx);
  }
  if (RootSizeInBits != VT.getSizeInBits()) {
    // Op is smaller than Root - extract the demanded elts for the subvector.
    unsigned Scale = RootSizeInBits / VT.getSizeInBits();
    unsigned NumOpMaskElts = RootMask.size() / Scale;
    assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
    assert(OpDemandedElts
               .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
               .isZero() &&
           "Out of range elements referenced in root mask");
    OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
  }
  // Rescale from root-mask granularity to the element count of Op's type.
  OpDemandedElts =
      APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());

  // Extract target shuffle mask and resolve sentinels and inputs.
  SmallVector OpMask;
  SmallVector OpInputs;
  APInt OpUndef, OpZero;
  bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
  if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
                             OpZero, DAG, Depth, false)) {
    // Shuffle inputs must not be larger than the shuffle result.
    // TODO: Relax this for single input faux shuffles (e.g. trunc).
    if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
          return OpInput.getValueSizeInBits() > VT.getSizeInBits();
        }))
      return SDValue();
  } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
             (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
             !isNullConstant(Op.getOperand(1))) {
    // Model a non-zero-index subvector extraction as a shuffle whose mask is
    // the sequential run of source indices starting at the extraction index.
    SDValue SrcVec = Op.getOperand(0);
    int ExtractIdx = Op.getConstantOperandVal(1);
    unsigned NumElts = VT.getVectorNumElements();
    OpInputs.assign({SrcVec});
    OpMask.assign(NumElts, SM_SentinelUndef);
    std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
    OpZero = OpUndef = APInt::getZero(NumElts);
  } else {
    return SDValue();
  }

  // If the shuffle result was smaller than the root, we need to adjust the
  // mask indices and pad the mask with undefs.
  if (RootSizeInBits > VT.getSizeInBits()) {
    unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
    unsigned OpMaskSize = OpMask.size();
    if (OpInputs.size() > 1) {
      // Re-base each index so that every input occupies PaddedMaskSize slots.
      unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
      for (int &M : OpMask) {
        if (M < 0)
          continue;
        int EltIdx = M % OpMaskSize;
        int OpIdx = M / OpMaskSize;
        M = (PaddedMaskSize * OpIdx) + EltIdx;
      }
    }
    OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
    OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
    OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
  }

  SmallVector Mask;
  SmallVector Ops;

  // We don't need to merge masks if the root is empty.
  bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
  if (EmptyRoot) {
    // Only resolve zeros if it will remove an input, otherwise we might end
    // up in an infinite loop.
    bool ResolveKnownZeros = true;
    if (!OpZero.isZero()) {
      APInt UsedInputs = APInt::getZero(OpInputs.size());
      for (int i = 0, e = OpMask.size(); i != e; ++i) {
        int M = OpMask[i];
        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
          continue;
        UsedInputs.setBit(M / OpMask.size());
        // Every input is still referenced by a non-zeroable element, so
        // resolving the zeros would not drop any input.
        if (UsedInputs.isAllOnes()) {
          ResolveKnownZeros = false;
          break;
        }
      }
    }
    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
                                      ResolveKnownZeros);

    Mask = OpMask;
    Ops.append(OpInputs.begin(), OpInputs.end());
  } else {
    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);

    // Add the inputs to the Ops list, avoiding duplicates.
    Ops.append(SrcOps.begin(), SrcOps.end());

    // Returns the index in Ops that holds Input: an existing equivalent entry
    // (compared through bitcasts), the overwritten InsertionPoint slot when
    // that is non-negative, or a newly appended slot.
    auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
      // Attempt to find an existing match.
      SDValue InputBC = peekThroughBitcasts(Input);
      for (int i = 0, e = Ops.size(); i < e; ++i)
        if (InputBC == peekThroughBitcasts(Ops[i]))
          return i;
      // Match failed - should we replace an existing Op?
      if (InsertionPoint >= 0) {
        Ops[InsertionPoint] = Input;
        return InsertionPoint;
      }
      // Add to the end of the Ops list.
      Ops.push_back(Input);
      return Ops.size() - 1;
    };

    // Only the first input may take over the slot of the op being replaced;
    // the remaining inputs are matched or appended.
    SmallVector OpInputIdx;
    for (SDValue OpInput : OpInputs)
      OpInputIdx.push_back(
          AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));

    assert(((RootMask.size() > OpMask.size() &&
             RootMask.size() % OpMask.size() == 0) ||
            (OpMask.size() > RootMask.size() &&
             OpMask.size() % RootMask.size() == 0) ||
            OpMask.size() == RootMask.size()) &&
           "The smaller number of elements must divide the larger.");

    // This function can be performance-critical, so we rely on the power-of-2
    // knowledge that we have about the mask sizes to replace div/rem ops with
    // bit-masks and shifts.
    assert(llvm::has_single_bit(RootMask.size()) &&
           "Non-power-of-2 shuffle mask sizes");
    assert(llvm::has_single_bit(OpMask.size()) &&
           "Non-power-of-2 shuffle mask sizes");
    unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
    unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());

    // The merged mask uses the finer of the two granularities; the coarser
    // mask is scaled up by its ratio.
    unsigned MaskWidth = std::max(OpMask.size(), RootMask.size());
    unsigned RootRatio =
        std::max(1, OpMask.size() >> RootMaskSizeLog2);
    unsigned OpRatio = std::max(1, RootMask.size() >> OpMaskSizeLog2);
    assert((RootRatio == 1 || OpRatio == 1) &&
           "Must not have a ratio for both incoming and op masks!");

    assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
    assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
    assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
    unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
    unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);

    Mask.resize(MaskWidth, SM_SentinelUndef);

    // Merge this shuffle operation's mask into our accumulated mask. Note that
    // this shuffle's mask will be the first applied to the input, followed by
    // the root mask to get us all the way to the root value arrangement. The
    // reason for this order is that we are recursing up the operation chain.
    for (unsigned i = 0; i < MaskWidth; ++i) {
      unsigned RootIdx = i >> RootRatioLog2;
      if (RootMask[RootIdx] < 0) {
        // This is a zero or undef lane, we're done.
        Mask[i] = RootMask[RootIdx];
        continue;
      }

      unsigned RootMaskedIdx =
          RootRatio == 1
              ? RootMask[RootIdx]
              : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

      // Just insert the scaled root mask value if it references an input other
      // than the SrcOp we're currently inserting.
      if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
          (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
        Mask[i] = RootMaskedIdx;
        continue;
      }

      // Reduce to an index within the current op, then to an OpMask element.
      RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
      unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
      if (OpMask[OpIdx] < 0) {
        // The incoming lanes are zero or undef, it doesn't matter which ones we
        // are using.
        Mask[i] = OpMask[OpIdx];
        continue;
      }

      // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
      unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
                                          : (OpMask[OpIdx] << OpRatioLog2) +
                                                (RootMaskedIdx & (OpRatio - 1));

      // Keep the element offset, then re-base it onto the slot in Ops that
      // holds the referenced input.
      OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
      int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
      assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
      OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;

      Mask[i] = OpMaskedIdx;
    }
  }

  // Peek through vector widenings and set out of bounds mask indices to undef.
  // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
  for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
    SDValue &Op = Ops[I];
    if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
        isNullConstant(Op.getOperand(2))) {
      Op = Op.getOperand(1);
      unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
      int Lo = I * Mask.size();
      int Hi = (I + 1) * Mask.size();
      int NewHi = Lo + (Mask.size() / Scale);
      // Indices in [NewHi, Hi) referred to the undef part of the widening.
      for (int &M : Mask) {
        if (Lo <= M && NewHi <= M && M < Hi)
          M = SM_SentinelUndef;
      }
    }
  }

  // Peek through any free extract_subvector nodes back to root size.
  for (SDValue &Op : Ops)
    while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
           (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
           isNullConstant(Op.getOperand(1)))
      Op = Op.getOperand(0);

  // Remove unused/repeated shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);

  // Handle the all undef/zero/ones cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
    return DAG.getUNDEF(RootVT);
  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
    return getZeroVector(RootVT, Subtarget, DAG, DL);
  if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
      !llvm::is_contained(Mask, SM_SentinelZero))
    return getOnesVector(RootVT, DAG, DL);

  assert(!Ops.empty() && "Shuffle with no inputs detected");
  HasVariableMask |= IsOpVariableMask;

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector CombinedNodes(SrcNodes.begin(),
                            SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be generally combined if it either has
  // a single use (i.e. current Op) or all its users have already been combined,
  // if not then we can still combine but should prevent generation of variable
  // shuffles to avoid constant pool bloat.
  // Don't recurse if we already have more source ops than we can combine in
  // the remaining recursion depth.
  if (Ops.size() < (MaxDepth - Depth)) {
    for (int i = 0, e = Ops.size(); i < e; ++i) {
      // For empty roots, we need to resolve zeroable elements before combining
      // them with other shuffles.
      SmallVector ResolvedMask = Mask;
      if (EmptyRoot)
        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
      bool AllowCrossLaneVar = false;
      bool AllowPerLaneVar = false;
      if (Ops[i].getNode()->hasOneUse() ||
          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
        AllowCrossLaneVar = AllowVariableCrossLaneMask;
        AllowPerLaneVar = AllowVariablePerLaneMask;
      }
      if (SDValue Res = combineX86ShufflesRecursively(
              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
              HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
              Subtarget))
        return Res;
    }
  }

  // Attempt to constant fold all of the constant source ops.
  if (SDValue Cst = combineX86ShufflesConstants(
          RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget))
    return Cst;

  // If constant fold failed and we only have constants - then we have
  // multiple uses by a single non-variable shuffle - just bail.
  if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
        APInt UndefElts;
        SmallVector RawBits;
        unsigned EltSizeInBits = RootSizeInBits / Mask.size();
        return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
                                             RawBits,
                                             /*AllowWholeUndefs*/ true,
                                             /*AllowPartialUndefs*/ true);
      })) {
    return SDValue();
  }

  // Canonicalize the combined shuffle mask chain with horizontal ops.
  // NOTE: This will update the Ops and Mask.
  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
          Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
    return DAG.getBitcast(RootVT, HOp);

  // Try to refine our inputs given our knowledge of target shuffle mask.
  for (auto I : enumerate(Ops)) {
    int OpIdx = I.index();
    SDValue &Op = I.value();

    // What range of shuffle mask element values results in picking from Op?
    int Lo = OpIdx * Mask.size();
    int Hi = Lo + Mask.size();

    // Which elements of Op do we demand, given the mask's granularity?
    APInt OpDemandedElts(Mask.size(), 0);
    for (int MaskElt : Mask) {
      if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
        int OpEltIdx = MaskElt - Lo;
        OpDemandedElts.setBit(OpEltIdx);
      }
    }

    // Is the shuffle result smaller than the root?
    if (Op.getValueSizeInBits() < RootSizeInBits) {
      // We padded the mask with undefs. But we now need to undo that.
      unsigned NumExpectedVectorElts = Mask.size();
      unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
      unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
      assert(!OpDemandedElts.extractBits(
                 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
             "Demanding the virtual undef widening padding?");
      OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
    }

    // The Op itself may be of different VT, so we need to scale the mask.
    unsigned NumOpElts = Op.getValueType().getVectorNumElements();
    APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);

    // Can this operand be simplified any further, given its demanded elements?
    if (SDValue NewOp =
            DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
                Op, OpScaledDemandedElts, DAG))
      Op = NewOp;
  }
  // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?

  // Widen any subvector shuffle inputs we've collected.
  // TODO: Remove this to avoid generating temporary nodes, we should only
  // widen once combineX86ShuffleChain has found a match.
  if (any_of(Ops, [RootSizeInBits](SDValue Op) {
        return Op.getValueSizeInBits() < RootSizeInBits;
      })) {
    for (SDValue &Op : Ops)
      if (Op.getValueSizeInBits() < RootSizeInBits)
        Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
                            RootSizeInBits);
    // Reresolve - we might have repeated subvector sources.
    resolveTargetShuffleInputsAndMask(Ops, Mask);
  }

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() <= 2) {
    // Minor canonicalization of the accumulated shuffle mask to make it easier
    // to match below. All this does is detect masks with sequential pairs of
    // elements, and shrink them to the half-width mask. It does this in a loop
    // so it will reduce the size of the mask to the minimal width mask which
    // performs an equivalent shuffle.
    while (Mask.size() > 1) {
      SmallVector WidenedMask;
      if (!canWidenShuffleElements(Mask, WidenedMask))
        break;
      Mask = std::move(WidenedMask);
    }

    // Canonicalization of binary shuffle masks to improve pattern matching by
    // commuting the inputs.
    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
      ShuffleVectorSDNode::commuteMask(Mask);
      std::swap(Ops[0], Ops[1]);
    }

    // Try to combine into a single shuffle instruction.
    if (SDValue Shuffle = combineX86ShuffleChain(
            Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
            AllowVariablePerLaneMask, DAG, Subtarget))
      return Shuffle;

    // If all the operands come from the same larger vector, fallthrough and try
    // to use combineX86ShuffleChainWithExtract.
    SDValue LHS = peekThroughBitcasts(Ops.front());
    SDValue RHS = peekThroughBitcasts(Ops.back());
    if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
        (RootSizeInBits / Mask.size()) != 64 ||
        LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        LHS.getOperand(0) != RHS.getOperand(0))
      return SDValue();
  }

  // If that failed and any input is extracted then try to combine as a
  // shuffle with the larger type.
  return combineX86ShuffleChainWithExtract(
      Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
      AllowVariablePerLaneMask, DAG, Subtarget);
}

/// Helper entry wrapper to combineX86ShufflesRecursively.
///
/// Starts the recursive combine with \p Op as both the only source operand and
/// the root, a single-element identity root mask, no previously combined
/// nodes, depth 0 and a depth limit of X86::MaxShuffleCombineDepth. Variable
/// cross-lane and per-lane masks are both allowed.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
  return combineX86ShufflesRecursively(
      {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
      /*HasVarMask*/ false,
      /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
      Subtarget);
}

/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
///
/// \p N must be a PSHUFD, PSHUFLW or PSHUFHW node (any other opcode is
/// unreachable). The returned mask covers a single 128-bit lane: PSHUFD
/// returns the lane mask as is, PSHUFLW returns its first four entries, and
/// PSHUFHW returns its last four entries rebased to start at zero.
///
/// NOTE(review): the SmallVector template argument lists appear to be missing
/// from this text - confirm against the upstream file before building.
static SmallVector getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector Mask;
  SmallVector Ops;
  bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
  // HaveMask is only read by the assert; the cast silences the unused
  // variable warning when asserts are compiled out.
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    // Keep only the four low-word entries.
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    // Drop the four low-word entries and rebase the high-word indices to 0.
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}

/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
///
/// \p N must be a PSHUFD node and \p Mask its 4-element mask. On success
/// \p Mask is overwritten with the merged mask and the rebuilt chain (bitcast
/// to N's type if necessary) is returned; otherwise an empty SDValue is
/// returned.
///
/// NOTE(review): the MutableArrayRef/SmallVector template argument lists
/// appear to be missing from this text - confirm against the upstream file.
static SDValue combineRedundantDWordShuffle(SDValue N,
                                            MutableArrayRef Mask,
                                            const SDLoc &DL,
                                            SelectionDAG &DAG) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      // Only unpacks of a value with itself, where the unpack is that value's
      // sole user, are handled.
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          // The matching half-shuffle is the combine target: leave the
          // switch, then the do/while.
          if (V.getOpcode() == CombineOp)
            break;

          // The other half-shuffle is recorded and stepped over.
          Chain.push_back(V);

          [[fallthrough]];
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    // Skipped bitcasts were not recorded in Chain, so re-create the type each
    // recorded node expects on its input.
    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}

// Attempt to commute shufps LHS loads:
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
//
// N is expected to be a VPERMILPI or SHUFP node of type VT; only v4f32, v8f32
// and v16f32 are handled. Returns the rewritten node, or an empty SDValue if
// no operand could be commuted.
//
// NOTE(review): the template argument of DAG.getSubtarget() appears to be
// missing from this text - confirm against the upstream file.
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
                                      SelectionDAG &DAG) {
  // TODO: Add vXf64 support.
  if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
    return SDValue();

  // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
  auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
    if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
      return SDValue();
    SDValue N0 = V.getOperand(0);
    SDValue N1 = V.getOperand(1);
    unsigned Imm = V.getConstantOperandVal(2);
    const X86Subtarget &Subtarget = DAG.getSubtarget();
    if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
        X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
      return SDValue();
    // Swap the two nibbles of the immediate to match the swapped operands.
    Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
    return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
                       DAG.getTargetConstant(Imm, DL, MVT::i8));
  };

  // In the rewrites below the user's immediate is XORed with a constant that
  // toggles bit 1 of the affected 2-bit selector fields (0xAA: all four
  // fields, 0x0A: the low two, 0xA0: the high two).
  switch (N.getOpcode()) {
  case X86ISD::VPERMILPI:
    if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
      unsigned Imm = N.getConstantOperandVal(1);
      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
                         DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
    }
    break;
  case X86ISD::SHUFP: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    unsigned Imm = N.getConstantOperandVal(2);
    if (N0 == N1) {
      // Both operands are the same SHUFP, so all four selectors are adjusted.
      if (SDValue NewSHUFP = commuteSHUFP(N, N0))
        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
    } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
      return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
                         DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
    } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
      return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
                         DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
    }
    break;
  }
  }

  return SDValue();
}

// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
// iff we don't demand the same element index for both X and Y.
//
// N0/N1 are the blend operands of type VT, BlendMask is the blend mask over
// VT's elements and DemandedElts marks the result elements that are needed.
// Both operands must have a single use and must be target shuffles whose
// masks can be scaled to VT's element count. Returns the new
// PERMUTE(BLEND(X,Y)) shuffle, or an empty SDValue if the fold does not apply.
//
// NOTE(review): the ArrayRef/SmallVector template argument lists appear to be
// missing from this text - confirm against the upstream file before building.
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1,
                                      ArrayRef BlendMask,
                                      const APInt &DemandedElts,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      const SDLoc &DL) {
  assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  SDValue BC0 = peekThroughOneUseBitcasts(N0);
  SDValue BC1 = peekThroughOneUseBitcasts(N1);

  // See if both operands are shuffles, and that we can scale the shuffle masks
  // to the same width as the blend mask.
  // TODO: Support SM_SentinelZero?
  SmallVector Ops0, Ops1;
  SmallVector Mask0, Mask1, ScaledMask0, ScaledMask1;
  if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
      !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
      !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
      !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
    return SDValue();

  // Determine the demanded elts from both permutes.
  APInt Demanded0, DemandedLHS0, DemandedRHS0;
  APInt Demanded1, DemandedLHS1, DemandedRHS1;
  if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
                              Demanded1,
                              /*AllowUndefElts=*/true) ||
      !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
                              DemandedRHS0, /*AllowUndefElts=*/true) ||
      !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
                              DemandedRHS1, /*AllowUndefElts=*/true))
    return SDValue();

  // Confirm that we only use a single operand from both permutes and that we
  // don't demand the same index from both.
  if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
      DemandedLHS0.intersects(DemandedLHS1))
    return SDValue();

  // Use the permute demanded elts masks as the new blend mask.
  // Create the new permute mask as a blend of the 2 original permute masks.
  SmallVector NewBlendMask(NumElts, SM_SentinelUndef);
  SmallVector NewPermuteMask(NumElts, SM_SentinelUndef);
  for (unsigned I = 0; I != NumElts; ++I) {
    if (Demanded0[I]) {
      // Result element I comes from the first permute: source element M is
      // taken from the first blend input.
      int M = ScaledMask0[I];
      if (0 <= M) {
        assert(isUndefOrEqual(NewBlendMask[M], M) &&
               "BlendMask demands LHS AND RHS");
        NewBlendMask[M] = M;
        NewPermuteMask[I] = M;
      }
    } else if (Demanded1[I]) {
      // Result element I comes from the second permute: source element M is
      // taken from the second blend input (index M + NumElts).
      int M = ScaledMask1[I];
      if (0 <= M) {
        assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
               "BlendMask demands LHS AND RHS");
        NewBlendMask[M] = M + NumElts;
        NewPermuteMask[I] = M;
      }
    }
  }
  assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
  assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");

  // v16i16 shuffles can explode in complexity very easily, only accept them if
  // the blend mask is the same in the 128-bit subvectors (or can widen to
  // v8i32) and the permute can be widened as well.
  if (VT == MVT::v16i16) {
    if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
        !canWidenShuffleElements(NewBlendMask))
      return SDValue();
    if (!canWidenShuffleElements(NewPermuteMask))
      return SDValue();
  }

  // Don't introduce lane-crossing permutes without AVX2, unless it can be
  // widened to a lane permute (vperm2f128).
  if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
      isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
                                NewPermuteMask) &&
      !canScaleShuffleElements(NewPermuteMask, 2))
    return SDValue();

  // Blend the two permute sources first, then apply the merged permute.
  SDValue NewBlend =
      DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
                           DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
  return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
                              NewPermuteMask);
}

// Returns true if Opcode is one of the unary opcodes recognised here
// (ISD::CTLZ, ISD::CTTZ, ISD::CTPOP), false otherwise.
// TODO - move this to TLI like isBinOp?
static bool isUnaryOp(unsigned Opcode) {
  switch (Opcode) {
  case ISD::CTLZ:
  case ISD::CTTZ:
  case ISD::CTPOP:
    return true;
  }
  return false;
}

// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
//
// N is the shuffle node. The rewrite is only performed when N is the sole
// user of the operand(s) it looks through and when enough of the inner
// operands are expected to merge with the moved shuffle (see
// IsMergeableWithShuffle). Returns the rewritten value bitcast to N's type,
// or an empty SDValue if nothing was changed.
//
// NOTE(review): the SmallVector and dyn_cast template argument lists appear
// to be missing from this text - confirm against the upstream file.
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
                                         const SDLoc &DL) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT ShuffleVT = N.getValueType();
  unsigned Opc = N.getOpcode();

  // Returns true if Op is a value that a shuffle moved onto it is expected to
  // fold into. FoldShuf enables the one-use target shuffle case and FoldLoad
  // enables the shuffle-foldable load case.
  auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
                                            bool FoldLoad = false) {
    // AllZeros/AllOnes constants are freely shuffled and will peek through
    // bitcasts. Other constant build vectors do not peek through bitcasts. Only
    // merge with target shuffles if it has one use so shuffle combining is
    // likely to kick in. Shuffles of splats are expected to be removed.
    return ISD::isBuildVectorAllOnes(Op.getNode()) ||
           ISD::isBuildVectorAllZeros(Op.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
           ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
           getTargetConstantFromNode(dyn_cast(Op)) ||
           (Op.getOpcode() == Opc && Op->hasOneUse()) ||
           (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
           (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
           (FoldLoad && isShuffleFoldableLoad(Op)) ||
           DAG.isSplatValue(Op, /*AllowUndefs*/ false);
  };
  auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
    // Ensure we only shuffle whole vector src elements, unless it's a logical
    // binop where we can more aggressively move shuffles from dst to src.
    return isLogicOp(BinOp) ||
           (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
  };

  switch (Opc) {
  // Unary and Unary+Permute Shuffles.
  case X86ISD::PSHUFB: {
    // Don't merge PSHUFB if it contains zero'd elements.
    SmallVector Mask;
    SmallVector Ops;
    if (!getTargetShuffleMask(N, false, Ops, Mask))
      break;
    [[fallthrough]];
  }
  case X86ISD::VBROADCAST:
  case X86ISD::MOVDDUP:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::VPERMI:
  case X86ISD::VPERMILPI: {
    if (N.getOperand(0).getValueType() == ShuffleVT &&
        N->isOnlyUserOf(N.getOperand(0).getNode())) {
      SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
      unsigned SrcOpcode = N0.getOpcode();
      if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
        SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
        SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
        // Target shuffle merging is disabled for VPERMI and load folding is
        // disabled for PSHUFB.
        if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
                                   Opc != X86ISD::PSHUFB) ||
            IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
                                   Opc != X86ISD::PSHUFB)) {
          SDValue LHS, RHS;
          Op00 = DAG.getBitcast(ShuffleVT, Op00);
          Op01 = DAG.getBitcast(ShuffleVT, Op01);
          // Two-operand shuffles carry an extra operand (N's operand 1) that
          // is reused unchanged for both new shuffles.
          if (N.getNumOperands() == 2) {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
          } else {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
          }
          // Re-apply the binop in its original type on the shuffled operands.
          EVT OpVT = N0.getValueType();
          return DAG.getBitcast(ShuffleVT,
                                DAG.getNode(SrcOpcode, DL, OpVT,
                                            DAG.getBitcast(OpVT, LHS),
                                            DAG.getBitcast(OpVT, RHS)));
        }
      }
    }
    break;
  }
  // Binary and Binary+Permute Shuffles.
  case X86ISD::INSERTPS: {
    // Don't merge INSERTPS if it contains zero'd elements.
    unsigned InsertPSMask = N.getConstantOperandVal(2);
    unsigned ZeroMask = InsertPSMask & 0xF;
    if (ZeroMask != 0)
      break;
    [[fallthrough]];
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSS:
  case X86ISD::BLENDI:
  case X86ISD::SHUFP:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL: {
    if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
        N->isOnlyUserOf(N.getOperand(1).getNode())) {
      SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
      SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
      unsigned SrcOpcode = N0.getOpcode();
      // Both shuffle inputs must be the same binop with the same type.
      if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
          N0.getValueType() == N1.getValueType() &&
          IsSafeToMoveShuffle(N0, SrcOpcode) &&
          IsSafeToMoveShuffle(N1, SrcOpcode)) {
        SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
        SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
        SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
        SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
        // Ensure the total number of shuffles doesn't increase by folding this
        // shuffle through to the source ops.
        if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
             (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
            ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
             (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
          SDValue LHS, RHS;
          Op00 = DAG.getBitcast(ShuffleVT, Op00);
          Op10 = DAG.getBitcast(ShuffleVT, Op10);
          Op01 = DAG.getBitcast(ShuffleVT, Op01);
          Op11 = DAG.getBitcast(ShuffleVT, Op11);
          // Three-operand shuffles carry an extra operand (N's operand 2)
          // that is reused unchanged for both new shuffles.
          if (N.getNumOperands() == 3) {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
          } else {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
          }
          EVT OpVT = N0.getValueType();
          return DAG.getBitcast(ShuffleVT,
                                DAG.getNode(SrcOpcode, DL, OpVT,
                                            DAG.getBitcast(OpVT, LHS),
                                            DAG.getBitcast(OpVT, RHS)));
        }
      }
      // Both shuffle inputs are the same unary op: shuffle their sources and
      // apply the unary op once to the result.
      if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
          N0.getValueType() == N1.getValueType() &&
          IsSafeToMoveShuffle(N0, SrcOpcode) &&
          IsSafeToMoveShuffle(N1, SrcOpcode)) {
        SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
        SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
        SDValue Res;
        Op00 = DAG.getBitcast(ShuffleVT, Op00);
        Op10 = DAG.getBitcast(ShuffleVT, Op10);
        if (N.getNumOperands() == 3) {
          Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
        } else {
          Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
        }
        EVT OpVT = N0.getValueType();
        return DAG.getBitcast(
            ShuffleVT,
            DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
      }
    }
    break;
  }
  }
  return SDValue();
}

/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
///
/// \p V must be an X86ISD::VPERM2X128 node (asserted). Both sources are looked
/// at through bitcasts; unless the second source is undef they must have the
/// same opcode and type. For the handled source opcodes the lane shuffle is
/// applied to the inner operands first and the source operation is re-applied
/// to the result.
/// \returns The rebuilt value bitcast to \p V's type, or an empty SDValue.
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
                                                      SelectionDAG &DAG,
                                                      const SDLoc &DL) {
  assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");

  MVT VT = V.getSimpleValueType();
  SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
  SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
  unsigned SrcOpc0 = Src0.getOpcode();
  unsigned SrcOpc1 = Src1.getOpcode();
  EVT SrcVT0 = Src0.getValueType();
  EVT SrcVT1 = Src1.getValueType();

  // An undef second source is always acceptable; otherwise both sources must
  // be the same operation on the same type.
  if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
    return SDValue();

  switch (SrcOpc0) {
  case X86ISD::MOVDDUP: {
    SDValue LHS = Src0.getOperand(0);
    SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
    SDValue Res =
        DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
    Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
    return DAG.getBitcast(VT, Res);
  }
  case X86ISD::VPERMILPI:
    // TODO: Handle v4f64 permutes with different low/high lane masks.
    if (SrcVT0 == MVT::v4f64) {
      // Compare immediate bits [1:0] against bits [3:2]; bail out when they
      // differ.
      uint64_t Mask = Src0.getConstantOperandVal(1);
      if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
        break;
    }
    [[fallthrough]];
  case X86ISD::VSHLI:
  case X86ISD::VSRLI:
  case X86ISD::VSRAI:
  case X86ISD::PSHUFD:
    // Both sources must use the same second operand so that it can be
    // re-applied once after the lane shuffle.
    if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
      SDValue LHS = Src0.getOperand(0);
      SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
      SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
                                V.getOperand(2));
      Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
      return DAG.getBitcast(VT, Res);
    }
    break;
  }

  return SDValue();
}

/// Try to combine x86 target specific shuffles.
///
/// The first switch handles each opcode in isolation. Every case returns
/// directly except PSHUFD/PSHUFLW/PSHUFHW, which decode their immediate into
/// \c Mask and break out to the no-op check and the second switch at the end
/// of the function.
///
/// \returns A replacement value; \p N itself when the replacement was already
///          installed through \p DCI (so the node is not revisited); or an
///          empty SDValue when nothing was changed.
///
/// NOTE(review): the `SmallVector` declarations and the `cast(...)`,
/// `dyn_cast(...)`, `isa(...)`, `const_cast(...)` calls in this function carry
/// no template arguments in this copy of the file; they look truncated -
/// confirm against upstream before building.
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
                                    SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N.getSimpleValueType();
  SmallVector Mask;
  unsigned Opcode = N.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
    return R;

  // Handle specific target shuffles.
  switch (Opcode) {
  case X86ISD::MOVDDUP: {
    SDValue Src = N.getOperand(0);
    // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
    if (VT == MVT::v2f64 && Src.hasOneUse() &&
        ISD::isNormalLoad(Src.getNode())) {
      LoadSDNode *LN = cast(Src);
      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
        SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
        DCI.CombineTo(N.getNode(), Movddup);
        // Redirect users of the old load's chain result to the new load.
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
        return N; // Return N so it doesn't get rechecked!
      }
    }

    return SDValue();
  }
  case X86ISD::VBROADCAST: {
    SDValue Src = N.getOperand(0);
    SDValue BC = peekThroughBitcasts(Src);
    EVT SrcVT = Src.getValueType();
    EVT BCVT = BC.getValueType();

    // If broadcasting from another shuffle, attempt to simplify it.
    // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
    if (isTargetShuffle(BC.getOpcode()) &&
        VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
      unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
      // Only the first Scale elements of BC are demanded; the rest stay undef.
      SmallVector DemandedMask(BCVT.getVectorNumElements(),
                               SM_SentinelUndef);
      for (unsigned i = 0; i != Scale; ++i)
        DemandedMask[i] = i;
      if (SDValue Res = combineX86ShufflesRecursively(
              {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
              X86::MaxShuffleCombineDepth,
              /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
              /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
        return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                           DAG.getBitcast(SrcVT, Res));
    }

    // broadcast(bitcast(src)) -> bitcast(broadcast(src))
    // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
    if (Src.getOpcode() == ISD::BITCAST &&
        SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
        TLI.isTypeLegal(BCVT) &&
        FixedVectorType::isValidElementType(
            BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
                                   VT.getVectorNumElements());
      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
    }

    // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
    // If we're re-broadcasting a smaller type then broadcast with that type and
    // bitcast.
    // TODO: Do this for any splat?
    if (Src.getOpcode() == ISD::BITCAST &&
        (BC.getOpcode() == X86ISD::VBROADCAST ||
         BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
        (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
        (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
      MVT NewVT = MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
                                   VT.getSizeInBits() / BCVT.getScalarSizeInBits());
      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
    }

    // Reduce broadcast source vector to lowest 128-bits.
    if (SrcVT.getSizeInBits() > 128)
      return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                         extract128BitVector(Src, 0, DAG, DL));

    // broadcast(scalar_to_vector(x)) -> broadcast(x).
    if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));

    // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
    if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        isNullConstant(Src.getOperand(1)) &&
        Src.getValueType() ==
            Src.getOperand(0).getValueType().getScalarType() &&
        TLI.isTypeLegal(Src.getOperand(0).getValueType()))
      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));

    // Share broadcast with the longest vector and extract low subvector (free).
    // Ensure the same SDValue from the SDNode use is being used.
    for (SDNode *User : Src->uses())
      if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
          Src == User->getOperand(0) &&
          User->getValueSizeInBits(0).getFixedValue() >
              VT.getFixedSizeInBits()) {
        return extractSubVector(SDValue(User, 0), 0, DAG, DL,
                                VT.getSizeInBits());
      }

    // vbroadcast(scalarload X) -> vbroadcast_load X
    // For float loads, extract other uses of the scalar from the broadcast.
    if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
        ISD::isNormalLoad(Src.getNode())) {
      LoadSDNode *LN = cast(Src);
      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
      SDValue BcastLd =
          DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                  LN->getMemoryVT(), LN->getMemOperand());
      // If the load value is used only by N, replace it via CombineTo N.
      // (Sampled before CombineTo below, which rewrites N's uses.)
      bool NoReplaceExtract = Src.hasOneUse();
      DCI.CombineTo(N.getNode(), BcastLd);
      if (NoReplaceExtract) {
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
      } else {
        // Other users of the scalar now read element 0 of the broadcast.
        SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
                                  DAG.getIntPtrConstant(0, DL));
        DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
      }
      return N; // Return N so it doesn't get rechecked!
    }

    // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
    // i16. So shrink it ourselves if we can make a broadcast_load.
    if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
        Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
      assert(Subtarget.hasAVX2() && "Expected AVX2");
      SDValue TruncIn = Src.getOperand(0);

      // If this is a truncate of a non extending load we can just narrow it to
      // use a broadcast_load.
      if (ISD::isNormalLoad(TruncIn.getNode())) {
        LoadSDNode *LN = cast(TruncIn);
        // Unless its volatile or atomic.
        if (LN->isSimple()) {
          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
          SDValue BcastLd = DAG.getMemIntrinsicNode(
              X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
              LN->getPointerInfo(), LN->getOriginalAlign(),
              LN->getMemOperand()->getFlags());
          DCI.CombineTo(N.getNode(), BcastLd);
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
          DCI.recursivelyDeleteUnusedNodes(Src.getNode());
          return N; // Return N so it doesn't get rechecked!
        }
      }

      // If this is a truncate of an i16 extload, we can directly replace it.
      if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
          ISD::isEXTLoad(Src.getOperand(0).getNode())) {
        LoadSDNode *LN = cast(Src.getOperand(0));
        if (LN->getMemoryVT().getSizeInBits() == 16) {
          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
          SDValue BcastLd =
              DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                      LN->getMemoryVT(), LN->getMemOperand());
          DCI.CombineTo(N.getNode(), BcastLd);
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
          DCI.recursivelyDeleteUnusedNodes(Src.getNode());
          return N; // Return N so it doesn't get rechecked!
        }
      }

      // If this is a truncate of load that has been shifted right, we can
      // offset the pointer and use a narrower load.
      if (TruncIn.getOpcode() == ISD::SRL &&
          TruncIn.getOperand(0).hasOneUse() &&
          isa(TruncIn.getOperand(1)) &&
          ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
        LoadSDNode *LN = cast(TruncIn.getOperand(0));
        unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
        // Make sure the shift amount and the load size are divisible by 16.
        // Don't do this if the load is volatile or atomic.
        if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
            LN->isSimple()) {
          // Convert the bit shift into a byte offset from the base pointer.
          unsigned Offset = ShiftAmt / 8;
          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
          SDValue Ptr = DAG.getMemBasePlusOffset(
              LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
          SDValue Ops[] = { LN->getChain(), Ptr };
          SDValue BcastLd = DAG.getMemIntrinsicNode(
              X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
              LN->getPointerInfo().getWithOffset(Offset),
              LN->getOriginalAlign(),
              LN->getMemOperand()->getFlags());
          DCI.CombineTo(N.getNode(), BcastLd);
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
          DCI.recursivelyDeleteUnusedNodes(Src.getNode());
          return N; // Return N so it doesn't get rechecked!
        }
      }
    }

    // vbroadcast(vzload X) -> vbroadcast_load X
    if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
      MemSDNode *LN = cast(Src);
      if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
        SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
        SDValue BcastLd =
            DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                    LN->getMemoryVT(), LN->getMemOperand());
        DCI.CombineTo(N.getNode(), BcastLd);
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
        return N; // Return N so it doesn't get rechecked!
      }
    }

    // vbroadcast(vector load X) -> vbroadcast_load
    if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
         SrcVT == MVT::v4i32) &&
        Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
      LoadSDNode *LN = cast(Src);
      // Unless the load is volatile or atomic.
      if (LN->isSimple()) {
        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
        SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
        SDValue BcastLd = DAG.getMemIntrinsicNode(
            X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
            LN->getPointerInfo(), LN->getOriginalAlign(),
            LN->getMemOperand()->getFlags());
        DCI.CombineTo(N.getNode(), BcastLd);
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
        return N; // Return N so it doesn't get rechecked!
      }
    }

    return SDValue();
  }
  case X86ISD::VZEXT_MOVL: {
    SDValue N0 = N.getOperand(0);

    // If this a vzmovl of a full vector load, replace it with a vzload, unless
    // the load is volatile.
    if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
      auto *LN = cast(N0);
      if (SDValue VZLoad =
              narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
        DCI.CombineTo(N.getNode(), VZLoad);
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
        return N;
      }
    }

    // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
    // and can just use a VZEXT_LOAD.
    // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
    if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
      auto *LN = cast(N0);
      if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
        SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
        SDValue VZLoad =
            DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
                                    LN->getMemoryVT(), LN->getMemOperand());
        DCI.CombineTo(N.getNode(), VZLoad);
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
        return N;
      }
    }

    // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
    // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
    // if the upper bits of the i64 are zero.
    if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        N0.getOperand(0).hasOneUse() &&
        N0.getOperand(0).getValueType() == MVT::i64) {
      SDValue In = N0.getOperand(0);
      // Mask covering the upper 32 bits of the i64 scalar.
      APInt Mask = APInt::getHighBitsSet(64, 32);
      if (DAG.MaskedValueIsZero(In, Mask)) {
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
        MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
        SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
        SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
        return DAG.getBitcast(VT, Movl);
      }
    }

    // Load a scalar integer constant directly to XMM instead of transferring an
    // immediate value from GPR.
    // vzext_movl (scalar_to_vector C) --> load [C,0...]
    if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
      if (auto *C = dyn_cast(N0.getOperand(0))) {
        // Create a vector constant - scalar constant followed by zeros.
        EVT ScalarVT = N0.getOperand(0).getValueType();
        Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
        unsigned NumElts = VT.getVectorNumElements();
        Constant *Zero = ConstantInt::getNullValue(ScalarTy);
        SmallVector ConstantVec(NumElts, Zero);
        ConstantVec[0] = const_cast(C->getConstantIntValue());

        // Load the vector constant from constant pool.
        MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
        SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
        MachinePointerInfo MPI =
            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
        Align Alignment = cast(CP)->getAlign();
        return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
                           MachineMemOperand::MOLoad);
      }
    }

    // Pull subvector inserts into undef through VZEXT_MOVL by making it an
    // insert into a zero vector. This helps get VZEXT_MOVL closer to
    // scalar_to_vectors where 256/512 are canonicalized to an insert and a
    // 128-bit scalar_to_vector. This reduces the number of isel patterns.
    if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
      SDValue V = peekThroughOneUseBitcasts(N0);

      if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
          isNullConstant(V.getOperand(2))) {
        SDValue In = V.getOperand(1);
        // Same element type as VT, sized to the inserted subvector.
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                     In.getValueSizeInBits() /
                                         VT.getScalarSizeInBits());
        In = DAG.getBitcast(SubVT, In);
        SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                           getZeroVector(VT, Subtarget, DAG, DL), Movl,
                           V.getOperand(2));
      }
    }

    return SDValue();
  }
  case X86ISD::BLENDI: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    unsigned EltBits = VT.getScalarSizeInBits();

    if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
      // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
      // TODO: Handle MVT::v16i16 repeated blend mask.
      if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
        MVT SrcVT = N0.getOperand(0).getSimpleValueType();
        unsigned SrcBits = SrcVT.getScalarSizeInBits();
        if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
          unsigned Size = VT.getVectorNumElements();
          unsigned NewSize = SrcVT.getVectorNumElements();
          // Rescale the blend immediate from Size bits to NewSize bits.
          APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
          APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
          return DAG.getBitcast(
              VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
                              N1.getOperand(0),
                              DAG.getTargetConstant(NewBlendMask.getZExtValue(),
                                                    DL, MVT::i8)));
        }
      }
      // Share PSHUFB masks:
      // blend(pshufb(x,m1),pshufb(y,m2))
      // --> m3 = blend(m1,m2)
      //     blend(pshufb(x,m3),pshufb(y,m3))
      if (N0.hasOneUse() && N1.hasOneUse()) {
        SmallVector Mask, ByteMask;
        SmallVector Ops;
        SDValue LHS = peekThroughOneUseBitcasts(N0);
        SDValue RHS = peekThroughOneUseBitcasts(N1);
        if (LHS.getOpcode() == X86ISD::PSHUFB &&
            RHS.getOpcode() == X86ISD::PSHUFB &&
            LHS.getOperand(1) != RHS.getOperand(1) &&
            LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
            getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
          assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
                 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
                 "BLENDI decode mismatch");
          MVT ShufVT = LHS.getSimpleValueType();
          SDValue MaskLHS = LHS.getOperand(1);
          SDValue MaskRHS = RHS.getOperand(1);
          // Narrow the blend mask by EltBits / 8 into ByteMask before blending
          // the two PSHUFB masks.
          llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
          if (SDValue NewMask = combineX86ShufflesConstants(
                  ShufVT, {MaskLHS, MaskRHS}, ByteMask,
                  /*HasVariableMask=*/true, DAG, DL, Subtarget)) {
            SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
                                         LHS.getOperand(0), NewMask);
            SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
                                         RHS.getOperand(0), NewMask);
            return DAG.getNode(X86ISD::BLENDI, DL, VT,
                               DAG.getBitcast(VT, NewLHS),
                               DAG.getBitcast(VT, NewRHS), N.getOperand(2));
          }
        }
      }
    }
    return SDValue();
  }
  case X86ISD::SHUFP: {
    // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
    // This is a more relaxed shuffle combiner that can ignore oneuse limits.
    // TODO: Support types other than v4f32.
    if (VT == MVT::v4f32) {
      bool Updated = false;
      SmallVector Mask;
      SmallVector Ops;
      if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
        for (int i = 0; i != 2; ++i) {
          SmallVector SubOps;
          SmallVector SubMask, SubScaledMask;
          SDValue Sub = peekThroughBitcasts(Ops[i]);
          // TODO: Scaling might be easier if we specify the demanded elts.
          if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
              scaleShuffleElements(SubMask, 4, SubScaledMask) &&
              SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
            // Operand i feeds result elements 2*i and 2*i+1; remap those two
            // mask entries through the inner single-input shuffle.
            int Ofs = i * 2;
            Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
            Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
            Ops[i] = DAG.getBitcast(VT, SubOps[0]);
            Updated = true;
          }
        }
      }
      if (Updated) {
        // Reduce every mask entry modulo 4 before encoding the immediate.
        for (int &M : Mask)
          M %= 4;
        Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
        return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
      }
    }
    return SDValue();
  }
  case X86ISD::VPERMI: {
    // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
    // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    unsigned EltSizeInBits = VT.getScalarSizeInBits();
    if (N0.getOpcode() == ISD::BITCAST &&
        N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
      SDValue Src = N0.getOperand(0);
      EVT SrcVT = Src.getValueType();
      SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
      return DAG.getBitcast(VT, Res);
    }
    return SDValue();
  }
  case X86ISD::SHUF128: {
    // If we're permuting the upper 256-bits subvectors of a concatenation, then
    // see if we can peek through and access the subvector directly.
    if (VT.is512BitVector()) {
      // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
      // upper subvector is used.
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      uint64_t Mask = N->getConstantOperandVal(2);
      SmallVector LHSOps, RHSOps;
      SDValue NewLHS, NewRHS;
      // 0x0A is the msb of the first two i2 indices, tested against LHS. When
      // both are set, LHS is replaced by its widened upper half and the msbs
      // are cleared.
      if ((Mask & 0x0A) == 0x0A &&
          collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
        NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
        Mask &= ~0x0A;
      }
      // 0xA0 is the msb of the last two i2 indices, tested against RHS.
      if ((Mask & 0xA0) == 0xA0 &&
          collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
        NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
        Mask &= ~0xA0;
      }
      if (NewLHS || NewRHS)
        return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
                           NewRHS ? NewRHS : RHS,
                           DAG.getTargetConstant(Mask, DL, MVT::i8));
    }
    return SDValue();
  }
  case X86ISD::VPERM2X128: {
    // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    if (LHS.getOpcode() == ISD::BITCAST &&
        (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
      EVT SrcVT = LHS.getOperand(0).getValueType();
      if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
                                              DAG.getBitcast(SrcVT, LHS),
                                              DAG.getBitcast(SrcVT, RHS),
                                              N->getOperand(2)));
      }
    }

    // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
    if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
      return Res;

    // Fold vperm2x128 subvector shuffle with an inner concat pattern.
    // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
    // Idx 0/1 address the halves of operand 0 and Idx 2/3 the halves of
    // operand 1; anything larger yields an empty SDValue.
    auto FindSubVector128 = [&](unsigned Idx) {
      if (Idx > 3)
        return SDValue();
      SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
      SmallVector SubOps;
      if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
        return SubOps[Idx & 1];
      unsigned NumElts = Src.getValueType().getVectorNumElements();
      // An upper half may also come from a 128-bit subvector inserted at the
      // midpoint of the source.
      if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
          Src.getOperand(1).getValueSizeInBits() == 128 &&
          Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
        return Src.getOperand(1);
      }
      return SDValue();
    };
    // Low nibble of the immediate selects the low result half, high nibble
    // the high result half.
    unsigned Imm = N.getConstantOperandVal(2);
    if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
      if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
        MVT SubVT = VT.getHalfNumVectorElementsVT();
        SubLo = DAG.getBitcast(SubVT, SubLo);
        SubHi = DAG.getBitcast(SubVT, SubHi);
        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
      }
    }
    return SDValue();
  }
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    if (N0->hasOneUse()) {
      SDValue V = peekThroughOneUseBitcasts(N0);
      switch (V.getOpcode()) {
      case X86ISD::VSHL:
      case X86ISD::VSRL:
      case X86ISD::VSRA:
      case X86ISD::VSHLI:
      case X86ISD::VSRLI:
      case X86ISD::VSRAI:
      case X86ISD::VROTLI:
      case X86ISD::VROTRI: {
        // Swap the order: shuffle the shift/rotate input first, then re-apply
        // the shift/rotate. Only done when its elements are no wider than the
        // shuffle's.
        MVT InnerVT = V.getSimpleValueType();
        if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
          SDValue Res = DAG.getNode(Opcode, DL, VT,
                                    DAG.getBitcast(VT, V.getOperand(0)), N1);
          Res = DAG.getBitcast(InnerVT, Res);
          Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
          return DAG.getBitcast(VT, Res);
        }
        break;
      }
      }
    }

    // Decode the immediate for the shared post-switch handling below.
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSH:
  case X86ISD::MOVSS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);

    // Canonicalize scalar FPOps:
    // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
    // If commutable, allow OP(N1[0], N0[0]).
    unsigned Opcode1 = N1.getOpcode();
    if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
        Opcode1 == ISD::FDIV) {
      SDValue N10 = N1.getOperand(0);
      SDValue N11 = N1.getOperand(1);
      if (N10 == N0 ||
          (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
        // Commuted match: put N0 first.
        if (N10 != N0)
          std::swap(N10, N11);
        MVT SVT = VT.getVectorElementType();
        SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
        N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
        N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
        SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
        SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
        return DAG.getNode(Opcode, DL, VT, N0, SclVec);
      }
    }

    return SDValue();
  }
  case X86ISD::INSERTPS: {
    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);
    // Immediate layout as decoded here: bits [7:6] source index, bits [5:4]
    // destination index, bits [3:0] zero mask.
    unsigned InsertPSMask = N.getConstantOperandVal(2);
    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
    unsigned ZeroMask = InsertPSMask & 0xF;

    // If we zero out all elements from Op0 then we don't need to reference it.
    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
                         DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

    // If we zero out the element from Op1 then we don't need to reference it.
    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                         DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));

    // Attempt to merge insertps Op1 with an inner target shuffle node.
    SmallVector TargetMask1;
    SmallVector Ops1;
    APInt KnownUndef1, KnownZero1;
    if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
                                     KnownZero1)) {
      if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
        // Zero/UNDEF insertion - zero out element and remove dependency.
        InsertPSMask |= (1u << DstIdx);
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                           DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
      }
      // Update insertps mask srcidx and reference the source input directly.
      int M = TargetMask1[SrcIdx];
      assert(0 <= M && M < 8 && "Shuffle index out of range");
      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
      Op1 = Ops1[M < 4 ? 0 : 1];
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
    }

    // Attempt to merge insertps Op0 with an inner target shuffle node.
    SmallVector TargetMask0;
    SmallVector Ops0;
    APInt KnownUndef0, KnownZero0;
    if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
                                     KnownZero0)) {
      bool Updated = false;
      bool UseInput00 = false;
      bool UseInput01 = false;
      for (int i = 0; i != 4; ++i) {
        if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
          // No change if element is already zero or the inserted element.
          continue;
        }

        if (KnownUndef0[i] || KnownZero0[i]) {
          // If the target mask is undef/zero then we must zero the element.
          InsertPSMask |= (1u << i);
          Updated = true;
          continue;
        }

        // The input vector element must be inline.
        int M = TargetMask0[i];
        if (M != i && M != (i + 4))
          return SDValue();

        // Determine which inputs of the target shuffle we're using.
        UseInput00 |= (0 <= M && M < 4);
        UseInput01 |= (4 <= M);
      }

      // If we're not using both inputs of the target shuffle then use the
      // referenced input directly.
      if (UseInput00 && !UseInput01) {
        Updated = true;
        Op0 = Ops0[0];
      } else if (!UseInput00 && UseInput01) {
        Updated = true;
        Op0 = Ops0[1];
      }

      if (Updated)
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                           DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
    }

    // If we're inserting an element from a vbroadcast load, fold the
    // load into the X86insertps instruction. We need to convert the scalar
    // load to a vector and clear the source lane of the INSERTPS control.
    if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
      auto *MemIntr = cast(Op1);
      if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
        SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
                                   MemIntr->getBasePtr(),
                                   MemIntr->getMemOperand());
        // Masking with 0x3f clears the source-index bits [7:6].
        SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
                                     DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
                                                 Load),
                                     DAG.getTargetConstant(InsertPSMask & 0x3f,
                                                           DL, MVT::i8));
        DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
        return Insert;
      }
    }

    return SDValue();
  }
  case X86ISD::VPERMV3: {
    // Combine VPERMV3 to widened VPERMV if the two source operands are split
    // from the same vector.
    SDValue V1 = peekThroughBitcasts(N.getOperand(0));
    SDValue V2 = peekThroughBitcasts(N.getOperand(2));
    MVT SVT = V1.getSimpleValueType();
    // V1 must be the subvector extracted at index 0 and V2 the one extracted
    // at index NumElts(V1), both from the same wide vector.
    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        V1.getConstantOperandVal(1) == 0 &&
        V2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        V2.getConstantOperandVal(1) == SVT.getVectorNumElements() &&
        V1.getOperand(0) == V2.getOperand(0)) {
      EVT NVT = V1.getOperand(0).getValueType();
      if (NVT.is256BitVector() ||
          (NVT.is512BitVector() && Subtarget.hasEVEX512())) {
        MVT WideVT = MVT::getVectorVT(
            VT.getScalarType(), NVT.getSizeInBits() / VT.getScalarSizeInBits());
        SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
                                      DL, WideVT.getSizeInBits());
        SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask,
                                   DAG.getBitcast(WideVT, V1.getOperand(0)));
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
                           DAG.getIntPtrConstant(0, DL));
      }
    }
    return SDValue();
  }
  default:
    return SDValue();
  }

  // Only the PSHUFD/PSHUFLW/PSHUFHW case breaks out of the switch above, so
  // Mask holds the decoded 4-element PSHUF mask from here on.

  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return N.getOperand(0);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
    if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
      int DMask[] = {0, 1, 2, 3};
      // PSHUFLW swaps dwords 0/1, PSHUFHW swaps dwords 2/3.
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      return DAG.getBitcast(VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse() && V.getOperand(0).hasOneUse()) {
      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
      if (D.getOpcode() == X86ISD::PSHUFD) {
        SmallVector VMask = getPSHUFShuffleMask(V);
        SmallVector DMask = getPSHUFShuffleMask(D);
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        // N and V have different opcodes, so their offsets differ and the two
        // half-masks together fill all eight word slots.
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }

    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
      return NewN;

    break;
  }

  return SDValue();
}

/// Checks if the shuffle mask takes subsequent elements
/// alternately from two vectors.
/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
///
/// Negative mask entries are skipped. Every other entry must select lane i of
/// one of the two inputs, all even lanes must agree on one input, and all odd
/// lanes on the other.
///
/// \param Mask         The shuffle mask to classify.
/// \param [out] Op0Even Set (only when true is returned) to whether the even
///                      lanes come from the first input.
/// \returns true if \p Mask has the alternating form described above.
///
/// NOTE(review): `ArrayRef Mask` carries no element type in this copy of the
/// file; it looks truncated - confirm against upstream.
static bool isAddSubOrSubAddMask(ArrayRef Mask, bool &Op0Even) {

  // Input index (0 or 1) seen so far for even [0] and odd [1] lanes; -1 means
  // no lane of that parity has been seen yet.
  int ParitySrc[2] = {-1, -1};
  unsigned Size = Mask.size();
  for (unsigned i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;

    // Make sure we are using the matching element from the input.
    if ((M % Size) != i)
      return false;

    // Make sure we use the same input for all elements of the same parity.
    int Src = M / Size;
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;
    ParitySrc[i % 2] = Src;
  }

  // Make sure each input is used.
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;

  Op0Even = ParitySrc[0] == 0;
  return true;
}

/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation to try to flow through the rest of the combiner
/// the fact that they're unused.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, bool &IsSubAdd) { EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) || !VT.getSimpleVT().isFloatingPoint()) return false; // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return false; SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); // Make sure we have an FADD and an FSUB. if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) || (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) || V1.getOpcode() == V2.getOpcode()) return false; // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) return false; // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS, RHS; if (V1.getOpcode() == ISD::FSUB) { LHS = V1->getOperand(0); RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return false; } else { assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode"); LHS = V2->getOperand(0); RHS = V2->getOperand(1); if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) && (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS)) return false; } ArrayRef Mask = cast(N)->getMask(); bool Op0Even; if (!isAddSubOrSubAddMask(Mask, Op0Even)) return false; // It's a subadd if the vector in the even parity is an FADD. IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD; Opnd0 = LHS; Opnd1 = RHS; return true; } /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd. 
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); MVT VT = N->getSimpleValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT)) return SDValue(); // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c). SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue FMAdd = Op0, FMSub = Op1; if (FMSub.getOpcode() != X86ISD::FMSUB) std::swap(FMAdd, FMSub); if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB || FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() || FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() || FMAdd.getOperand(2) != FMSub.getOperand(2)) return SDValue(); // Check for correct shuffle mask. ArrayRef Mask = cast(N)->getMask(); bool Op0Even; if (!isAddSubOrSubAddMask(Mask, Op0Even)) return SDValue(); // FMAddSub takes zeroth operand from FMSub node. bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd; unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1), FMAdd.getOperand(2)); } /// Try to combine a shuffle into a target-specific add-sub or /// mul-add-sub node. static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG)) return V; SDValue Opnd0, Opnd1; bool IsSubAdd; if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd)) return SDValue(); MVT VT = N->getSimpleValueType(0); // Try to generate X86ISD::FMADDSUB node here. 
SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } if (IsSubAdd) return SDValue(); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! if (VT.is512BitVector()) return SDValue(); // Do not generate X86ISD::ADDSUB node for FP16's vector types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with FP16 ADDSUB instructions! if (VT.getVectorElementType() == MVT::f16) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } // We are looking for a shuffle where both sources are concatenated with undef // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so // if we can express this as a single-source shuffle, that's preferable. static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX2() || !isa(N)) return SDValue(); EVT VT = N->getValueType(0); // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (VT.getVectorElementType() != MVT::i32 && VT.getVectorElementType() != MVT::i64 && VT.getVectorElementType() != MVT::f32 && VT.getVectorElementType() != MVT::f64) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Check that both sources are concats with undef. if (N0.getOpcode() != ISD::CONCAT_VECTORS || N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) return SDValue(); // Construct the new shuffle mask. 
Elements from the first source retain their // index, but elements from the second source no longer need to skip an undef. SmallVector Mask; int NumElts = VT.getVectorNumElements(); auto *SVOp = cast(N); for (int Elt : SVOp->getMask()) Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); } /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the /// low half of each source vector and does not set any high half elements in /// the destination vector, narrow the shuffle to half its original size. static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { EVT VT = Shuf->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0))) return SDValue(); if (!VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); // See if we can ignore all of the high elements of the shuffle. ArrayRef Mask = Shuf->getMask(); if (!isUndefUpperHalf(Mask)) return SDValue(); // Check if the shuffle mask accesses only the low half of each input vector // (half-index output is 0 or 2). int HalfIdx1, HalfIdx2; SmallVector HalfMask(Mask.size() / 2); if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) || (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1)) return SDValue(); // Create a half-width shuffle to replace the unnecessarily wide shuffle. // The trick is knowing that all of the insert/extract are actually free // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle // of narrow inputs into a narrow output, and that is always cheaper than // the wide shuffle that we started with. 
return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), Shuf->getOperand(1), HalfMask, HalfIdx1, HalfIdx2, false, DAG, /*UseConcat*/ true); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (auto *Shuf = dyn_cast(N)) if (SDValue V = narrowShuffle(Shuf, DAG)) return V; // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. SDLoc dl(N); EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget)) if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG)) return AddSub; // Attempt to combine into a vector load/broadcast. if (SDValue LD = combineToConsecutiveLoads( VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true)) return LD; // For AVX2, we sometimes want to combine // (vector_shuffle (concat_vectors t1, undef) // (concat_vectors t2, undef)) // Into: // (vector_shuffle (concat_vectors t1, t2), undef) // Since the latter can be efficiently lowered with VPERMD/VPERMQ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget)) return ShufConcat; if (isTargetShuffle(N->getOpcode())) { SDValue Op(N, 0); if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget)) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle // instructions into higher-order shuffles. We do this after combining // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; // Simplify source operands based on shuffle mask. // TODO - merge this into combineX86ShufflesRecursively. 
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI)) return SDValue(N, 0); // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)). // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)). // Perform this after other shuffle combines to allow inner shuffles to be // combined away first. if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl)) return BinOp; } return SDValue(); } // Simplify variable target shuffle masks based on the demanded elements. // TODO: Handle DemandedBits in mask indices as well? bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle( SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const { // If we're demanding all elements don't bother trying to simplify the mask. unsigned NumElts = DemandedElts.getBitWidth(); if (DemandedElts.isAllOnes()) return false; SDValue Mask = Op.getOperand(MaskIndex); if (!Mask.hasOneUse()) return false; // Attempt to generically simplify the variable shuffle mask. APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, Depth + 1)) return true; // Attempt to extract+simplify a (constant pool load) shuffle mask. // TODO: Support other types from getTargetShuffleMaskIndices? SDValue BC = peekThroughOneUseBitcasts(Mask); EVT BCVT = BC.getValueType(); auto *Load = dyn_cast(BC); if (!Load || !Load->getBasePtr().hasOneUse()) return false; const Constant *C = getTargetConstantFromNode(Load); if (!C) return false; Type *CTy = C->getType(); if (!CTy->isVectorTy() || CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits()) return false; // Handle scaling for i64 elements on 32-bit targets. 
unsigned NumCstElts = cast(CTy)->getNumElements(); if (NumCstElts != NumElts && NumCstElts != (NumElts * 2)) return false; unsigned Scale = NumCstElts / NumElts; // Simplify mask if we have an undemanded element that is not undef. bool Simplified = false; SmallVector ConstVecOps; for (unsigned i = 0; i != NumCstElts; ++i) { Constant *Elt = C->getAggregateElement(i); if (!DemandedElts[i / Scale] && !isa(Elt)) { ConstVecOps.push_back(UndefValue::get(Elt->getType())); Simplified = true; continue; } ConstVecOps.push_back(Elt); } if (!Simplified) return false; // Generate new constant pool entry + legalize immediately for the load. SDLoc DL(Op); SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT); SDValue LegalCV = LowerConstantPool(CV, TLO.DAG); SDValue NewMask = TLO.DAG.getLoad( BCVT, DL, TLO.DAG.getEntryNode(), LegalCV, MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()), Load->getAlign()); return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask)); } bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { int NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); // Handle special case opcodes. switch (Opc) { case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { APInt LHSUndef, LHSZero; APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; // Multiply by zero. 
KnownZero = LHSZero | RHSZero; break; } case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: { APInt LHSUndef, LHSZero; APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts); if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; // TODO: Multiply by zero. // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent. APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero; if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero; if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; break; } case X86ISD::PSADBW: { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); assert(VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && "Unexpected PSADBW types"); // Aggressively peek through ops to get at the demanded elts. if (!DemandedElts.isAllOnes()) { unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts( LHS, DemandedSrcElts, TLO.DAG, Depth + 1); SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts( RHS, DemandedSrcElts, TLO.DAG, Depth + 1); if (NewLHS || NewRHS) { NewLHS = NewLHS ? NewLHS : LHS; NewRHS = NewRHS ? NewRHS : RHS; return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS)); } } break; } case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: { // We only need the bottom 64-bits of the (128-bit) shift amount. 
SDValue Amt = Op.getOperand(1); MVT AmtVT = Amt.getSimpleValueType(); assert(AmtVT.is128BitVector() && "Unexpected value type"); // If we reuse the shift amount just for sse shift amounts then we know that // only the bottom 64-bits are only ever used. bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) { unsigned UseOpc = Use->getOpcode(); return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL || UseOpc == X86ISD::VSRA) && Use->getOperand(0) != Amt; }); APInt AmtUndef, AmtZero; unsigned NumAmtElts = AmtVT.getVectorNumElements(); APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, Depth + 1, AssumeSingleUse)) return true; [[fallthrough]]; } case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); APInt SrcUndef; if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO, Depth + 1)) return true; // Fold shift(0,x) -> 0 if (DemandedElts.isSubsetOf(KnownZero)) return TLO.CombineTo( Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); // Aggressively peek through ops to get at the demanded elts. 
if (!DemandedElts.isAllOnes()) if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( Src, DemandedElts, TLO.DAG, Depth + 1)) return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1))); break; } case X86ISD::VPSHA: case X86ISD::VPSHL: case X86ISD::VSHLV: case X86ISD::VSRLV: case X86ISD::VSRAV: { APInt LHSUndef, LHSZero; APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; // Fold shift(0,x) -> 0 if (DemandedElts.isSubsetOf(LHSZero)) return TLO.CombineTo( Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; KnownZero = LHSZero; break; } case X86ISD::PCMPEQ: case X86ISD::PCMPGT: { APInt LHSUndef, LHSZero; APInt RHSUndef, RHSZero; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; break; } case X86ISD::KSHIFTL: { SDValue Src = Op.getOperand(0); auto *Amt = cast(Op.getOperand(1)); assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); unsigned ShiftAmt = Amt->getZExtValue(); if (ShiftAmt == 0) return TLO.CombineTo(Op, Src); // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a // single shift. We can do this if the bottom bits (which are shifted // out) are never demanded. 
if (Src.getOpcode() == X86ISD::KSHIFTR) { if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) { unsigned C1 = Src.getConstantOperandVal(1); unsigned NewOpc = X86ISD::KSHIFTL; int Diff = ShiftAmt - C1; if (Diff < 0) { Diff = -Diff; NewOpc = X86ISD::KSHIFTR; } SDLoc dl(Op); SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); return TLO.CombineTo( Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); } } APInt DemandedSrc = DemandedElts.lshr(ShiftAmt); if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, Depth + 1)) return true; KnownUndef <<= ShiftAmt; KnownZero <<= ShiftAmt; KnownZero.setLowBits(ShiftAmt); break; } case X86ISD::KSHIFTR: { SDValue Src = Op.getOperand(0); auto *Amt = cast(Op.getOperand(1)); assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); unsigned ShiftAmt = Amt->getZExtValue(); if (ShiftAmt == 0) return TLO.CombineTo(Op, Src); // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a // single shift. We can do this if the top bits (which are shifted // out) are never demanded. 
if (Src.getOpcode() == X86ISD::KSHIFTL) { if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) { unsigned C1 = Src.getConstantOperandVal(1); unsigned NewOpc = X86ISD::KSHIFTR; int Diff = ShiftAmt - C1; if (Diff < 0) { Diff = -Diff; NewOpc = X86ISD::KSHIFTL; } SDLoc dl(Op); SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); return TLO.CombineTo( Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); } } APInt DemandedSrc = DemandedElts.shl(ShiftAmt); if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, Depth + 1)) return true; KnownUndef.lshrInPlace(ShiftAmt); KnownZero.lshrInPlace(ShiftAmt); KnownZero.setHighBits(ShiftAmt); break; } case X86ISD::ANDNP: { // ANDNP = (~LHS & RHS); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { APInt UndefElts; SmallVector EltBits; int NumElts = VT.getVectorNumElements(); int EltSizeInBits = VT.getScalarSizeInBits(); APInt OpBits = APInt::getAllOnes(EltSizeInBits); APInt OpElts = DemandedElts; if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) { OpBits.clearAllBits(); OpElts.clearAllBits(); for (int I = 0; I != NumElts; ++I) { if (!DemandedElts[I]) continue; if (UndefElts[I]) { // We can't assume an undef src element gives an undef dst - the // other src might be zero. OpBits.setAllBits(); OpElts.setBit(I); } else if ((Invert && !EltBits[I].isAllOnes()) || (!Invert && !EltBits[I].isZero())) { OpBits |= Invert ? 
~EltBits[I] : EltBits[I]; OpElts.setBit(I); } } } return std::make_pair(OpBits, OpElts); }; APInt BitsLHS, EltsLHS; APInt BitsRHS, EltsRHS; std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS); std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true); APInt LHSUndef, LHSZero; APInt RHSUndef, RHSZero; if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO, Depth + 1)) return true; if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO, Depth + 1)) return true; if (!DemandedElts.isAllOnes()) { SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS, TLO.DAG, Depth + 1); SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS, TLO.DAG, Depth + 1); if (NewLHS || NewRHS) { NewLHS = NewLHS ? NewLHS : LHS; NewRHS = NewRHS ? NewRHS : RHS; return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS)); } } break; } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: case X86ISD::CVTPH2PS: case X86ISD::CVTPS2PH: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); APInt SrcUndef, SrcZero; APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; } case X86ISD::PACKSS: case X86ISD::PACKUS: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt LHSUndef, LHSZero; if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt RHSUndef, RHSZero; if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO, Depth + 1)) return true; // TODO - pass on known zero/undef. // Aggressively peek through ops to get at the demanded elts. // TODO - we should do this for all target/faux shuffles ops. 
if (!DemandedElts.isAllOnes()) { SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS, TLO.DAG, Depth + 1); SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS, TLO.DAG, Depth + 1); if (NewN0 || NewN1) { NewN0 = NewN0 ? NewN0 : N0; NewN1 = NewN1 ? NewN1 : N1; return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1)); } } break; } case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: case X86ISD::FHSUB: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); APInt DemandedLHS, DemandedRHS; getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt LHSUndef, LHSZero; if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt RHSUndef, RHSZero; if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO, Depth + 1)) return true; // TODO - pass on known zero/undef. // Aggressively peek through ops to get at the demanded elts. // TODO: Handle repeated operands. if (N0 != N1 && !DemandedElts.isAllOnes()) { SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS, TLO.DAG, Depth + 1); SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS, TLO.DAG, Depth + 1); if (NewN0 || NewN1) { NewN0 = NewN0 ? NewN0 : N0; NewN1 = NewN1 ? 
NewN1 : N1; return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1)); } } break; } case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO, Depth + 1)) return true; KnownZero = SrcZero.zextOrTrunc(NumElts); KnownUndef = SrcUndef.zextOrTrunc(NumElts); break; } case X86ISD::BLENDI: { SmallVector BlendMask; DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask); if (SDValue R = combineBlendOfPermutes( VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask, DemandedElts, TLO.DAG, Subtarget, SDLoc(Op))) return TLO.CombineTo(Op, R); break; } case X86ISD::BLENDV: { APInt SelUndef, SelZero; if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef, SelZero, TLO, Depth + 1)) return true; // TODO: Use SelZero to adjust LHS/RHS DemandedElts. APInt LHSUndef, LHSZero; if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef, LHSZero, TLO, Depth + 1)) return true; APInt RHSUndef, RHSZero; if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef, RHSZero, TLO, Depth + 1)) return true; KnownZero = LHSZero & RHSZero; KnownUndef = LHSUndef & RHSUndef; break; } case X86ISD::VZEXT_MOVL: { // If upper demanded elements are already zero then we have nothing to do. SDValue Src = Op.getOperand(0); APInt DemandedUpperElts = DemandedElts; DemandedUpperElts.clearLowBits(1); if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1)) return TLO.CombineTo(Op, Src); break; } case X86ISD::VZEXT_LOAD: { // If upper demanded elements are not demanded then simplify to a // scalar_to_vector(load()). 
MVT SVT = VT.getSimpleVT().getVectorElementType(); if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) { SDLoc DL(Op); auto *Mem = cast(Op); SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(), Mem->getMemOperand()); SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt); return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec)); } break; } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (!SrcVT.isVector()) break; // Don't bother broadcasting if we just need the 0'th element. if (DemandedElts == 1) { if (Src.getValueType() != VT) Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, SDLoc(Op)); return TLO.CombineTo(Op, Src); } APInt SrcUndef, SrcZero; APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; // Aggressively peek through src to get at the demanded elt. // TODO - we should do this for all target/faux shuffles ops. if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( Src, SrcElts, TLO.DAG, Depth + 1)) return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); break; } case X86ISD::VPERMV: if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO, Depth)) return true; break; case X86ISD::PSHUFB: case X86ISD::VPERMV3: case X86ISD::VPERMILPV: if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO, Depth)) return true; break; case X86ISD::VPPERM: case X86ISD::VPERMIL2: if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO, Depth)) return true; break; } // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not // demand any of the high elements, then narrow the op to 128/256-bits: e.g. 
// (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 if ((VT.is256BitVector() || VT.is512BitVector()) && DemandedElts.lshr(NumElts / 2) == 0) { unsigned SizeInBits = VT.getSizeInBits(); unsigned ExtSizeInBits = SizeInBits / 2; // See if 512-bit ops only use the bottom 128-bits. if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) ExtSizeInBits = SizeInBits / 4; switch (Opc) { // Scalar broadcast. case X86ISD::VBROADCAST: { SDLoc DL(Op); SDValue Src = Op.getOperand(0); if (Src.getValueSizeInBits() > ExtSizeInBits) Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits); EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), ExtSizeInBits / VT.getScalarSizeInBits()); SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src); return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, TLO.DAG, DL, ExtSizeInBits)); } case X86ISD::VBROADCAST_LOAD: { SDLoc DL(Op); auto *MemIntr = cast(Op); EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), ExtSizeInBits / VT.getScalarSizeInBits()); SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other); SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)}; SDValue Bcst = TLO.DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Bcst.getValue(1)); return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, TLO.DAG, DL, ExtSizeInBits)); } // Subvector broadcast. 
case X86ISD::SUBV_BROADCAST_LOAD: { auto *MemIntr = cast(Op); EVT MemVT = MemIntr->getMemoryVT(); if (ExtSizeInBits == MemVT.getStoreSizeInBits()) { SDLoc DL(Op); SDValue Ld = TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(), MemIntr->getMemOperand()); TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1)); return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0, TLO.DAG, DL, ExtSizeInBits)); } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) { SDLoc DL(Op); EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), ExtSizeInBits / VT.getScalarSizeInBits()); if (SDValue BcstLd = getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG)) return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0, TLO.DAG, DL, ExtSizeInBits)); } break; } // Byte shifts by immediate. case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: // Shift by uniform. case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: // Shift by immediate. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: { SDLoc DL(Op); SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1)); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } case X86ISD::VPERMI: { // Simplify PERMPD/PERMQ to extract_subvector. // TODO: This should be done in shuffle combining. 
if (VT == MVT::v4f64 || VT == MVT::v4i64) { SmallVector Mask; DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask); if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) { SDLoc DL(Op); SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128); return TLO.CombineTo(Op, Insert); } } break; } case X86ISD::VPERM2X128: { // Simplify VPERM2F128/VPERM2I128 to extract_subvector. SDLoc DL(Op); unsigned LoMask = Op.getConstantOperandVal(2) & 0xF; if (LoMask & 0x8) return TLO.CombineTo( Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL)); unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2); unsigned SrcIdx = (LoMask & 0x2) >> 1; SDValue ExtOp = extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } // Zero upper elements. case X86ISD::VZEXT_MOVL: // Target unary shuffles by immediate: case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: case X86ISD::VPERMILPI: // (Non-Lane Crossing) Target Shuffles. case X86ISD::VPERMILPV: case X86ISD::VPERMIL2: case X86ISD::PSHUFB: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::BLENDI: // Integer ops. case X86ISD::PACKSS: case X86ISD::PACKUS: case X86ISD::PCMPEQ: case X86ISD::PCMPGT: case X86ISD::PMULUDQ: case X86ISD::PMULDQ: case X86ISD::VSHLV: case X86ISD::VSRLV: case X86ISD::VSRAV: // Float ops. case X86ISD::FMAX: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMINC: // Horizontal Ops. 
case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: case X86ISD::FHSUB: { SDLoc DL(Op); SmallVector Ops; for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { SDValue SrcOp = Op.getOperand(i); EVT SrcVT = SrcOp.getValueType(); assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) && "Unsupported vector size"); Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL, ExtSizeInBits) : SrcOp); } MVT ExtVT = VT.getSimpleVT(); ExtVT = MVT::getVectorVT(ExtVT.getScalarType(), ExtSizeInBits / ExtVT.getScalarSizeInBits()); SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } } } // For splats, unless we *only* demand the 0'th element, // stop attempts at simplification here, we aren't going to improve things, // this is better than any potential shuffle. if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false)) return false; // Get target/faux shuffle mask. APInt OpUndef, OpZero; SmallVector OpMask; SmallVector OpInputs; if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef, OpZero, TLO.DAG, Depth, false)) return false; // Shuffle inputs must be the same size as the result. if (OpMask.size() != (unsigned)NumElts || llvm::any_of(OpInputs, [VT](SDValue V) { return VT.getSizeInBits() != V.getValueSizeInBits() || !V.getValueType().isVector(); })) return false; KnownZero = OpZero; KnownUndef = OpUndef; // Check if shuffle mask can be simplified to undef/zero/identity. 
int NumSrcs = OpInputs.size(); for (int i = 0; i != NumElts; ++i) if (!DemandedElts[i]) OpMask[i] = SM_SentinelUndef; if (isUndefInRange(OpMask, 0, NumElts)) { KnownUndef.setAllBits(); return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); } if (isUndefOrZeroInRange(OpMask, 0, NumElts)) { KnownZero.setAllBits(); return TLO.CombineTo( Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); } for (int Src = 0; Src != NumSrcs; ++Src) if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src])); // Attempt to simplify inputs. for (int Src = 0; Src != NumSrcs; ++Src) { // TODO: Support inputs of different types. if (OpInputs[Src].getValueType() != VT) continue; int Lo = Src * NumElts; APInt SrcElts = APInt::getZero(NumElts); for (int i = 0; i != NumElts; ++i) if (DemandedElts[i]) { int M = OpMask[i] - Lo; if (0 <= M && M < NumElts) SrcElts.setBit(M); } // TODO - Propagate input undef/zero elts. APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; } // If we don't demand all elements, then attempt to combine to a simpler // shuffle. // We need to convert the depth to something combineX86ShufflesRecursively // can handle - so pretend its Depth == 0 again, and reduce the max depth // to match. This prevents combineX86ShuffleChain from returning a // combined shuffle that's the same as the original root, causing an // infinite loop. 
if (!DemandedElts.isAllOnes()) { assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range"); SmallVector DemandedMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) if (DemandedElts[i]) DemandedMask[i] = i; SDValue NewShuffle = combineX86ShufflesRecursively( {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG, Subtarget); if (NewShuffle) return TLO.CombineTo(Op, NewShuffle); } return false; } bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const { EVT VT = Op.getValueType(); unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); switch(Opc) { case X86ISD::VTRUNC: { KnownBits KnownOp; SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); // Simplify the input, using demanded bit information. APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits()); APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements()); if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1)) return true; break; } case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. KnownBits KnownLHS, KnownRHS; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast. // FIXME: Can we bound this better? 
APInt DemandedMask = APInt::getLowBitsSet(64, 32); APInt DemandedMaskLHS = APInt::getAllOnes(64); APInt DemandedMaskRHS = APInt::getAllOnes(64); bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512(); if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS)) DemandedMaskLHS = DemandedMask; if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS)) DemandedMaskRHS = DemandedMask; if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts, KnownLHS, TLO, Depth + 1)) return true; if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts, KnownRHS, TLO, Depth + 1)) return true; // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'. KnownRHS = KnownRHS.trunc(32); if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() && KnownRHS.getConstant().isOne()) { SDLoc DL(Op); SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask)); } // Aggressively peek through ops to get at the demanded low bits. SDValue DemandedLHS = SimplifyMultipleUseDemandedBits( LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1); SDValue DemandedRHS = SimplifyMultipleUseDemandedBits( RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1); if (DemandedLHS || DemandedRHS) { DemandedLHS = DemandedLHS ? DemandedLHS : LHS; DemandedRHS = DemandedRHS ? DemandedRHS : RHS; return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS)); } break; } case X86ISD::ANDNP: { KnownBits Known2; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits, OriginalDemandedElts, Known2, TLO, Depth + 1)) return true; // If the RHS is a constant, see if we can simplify it. 
if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits, OriginalDemandedElts, TLO)) return true; // ANDNP = (~Op0 & Op1); Known.One &= Known2.Zero; Known.Zero |= Known2.One; break; } case X86ISD::VSHLI: { SDValue Op0 = Op.getOperand(0); unsigned ShAmt = Op.getConstantOperandVal(1); if (ShAmt >= BitWidth) break; APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a // single shift. We can do this if the bottom bits (which are shifted // out) are never demanded. if (Op0.getOpcode() == X86ISD::VSRLI && OriginalDemandedBits.countr_zero() >= ShAmt) { unsigned Shift2Amt = Op0.getConstantOperandVal(1); if (Shift2Amt < BitWidth) { int Diff = ShAmt - Shift2Amt; if (Diff == 0) return TLO.CombineTo(Op, Op0.getOperand(0)); unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; SDValue NewShift = TLO.DAG.getNode( NewOpc, SDLoc(Op), VT, Op0.getOperand(0), TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); return TLO.CombineTo(Op, NewShift); } } // If we are only demanding sign bits then we can use the shift source directly. unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1); unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero(); if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits) return TLO.CombineTo(Op, Op0); if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits known zero. Known.Zero.setLowBits(ShAmt); return false; } case X86ISD::VSRLI: { unsigned ShAmt = Op.getConstantOperandVal(1); if (ShAmt >= BitWidth) break; APInt DemandedMask = OriginalDemandedBits << ShAmt; if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits known zero. 
Known.Zero.setHighBits(ShAmt); return false; } case X86ISD::VSRAI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); unsigned ShAmt = Op1->getAsZExtVal(); if (ShAmt >= BitWidth) break; APInt DemandedMask = OriginalDemandedBits << ShAmt; // If we just want the sign bit then we don't need to shift it. if (OriginalDemandedBits.isSignMask()) return TLO.CombineTo(Op, Op0); // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 if (Op0.getOpcode() == X86ISD::VSHLI && Op.getOperand(1) == Op0.getOperand(1)) { SDValue Op00 = Op0.getOperand(0); unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); if (ShAmt < NumSignBits) return TLO.CombineTo(Op, Op00); } // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. if (OriginalDemandedBits.countl_zero() < ShAmt) DemandedMask.setSignBit(); if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. if (Known.Zero[BitWidth - ShAmt - 1] || OriginalDemandedBits.countl_zero() >= ShAmt) return TLO.CombineTo( Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); // High bits are known one. 
if (Known.One[BitWidth - ShAmt - 1]) Known.One.setHighBits(ShAmt); return false; } case X86ISD::BLENDV: { SDValue Sel = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); APInt SignMask = APInt::getSignMask(BitWidth); SDValue NewSel = SimplifyMultipleUseDemandedBits( Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1); SDValue NewLHS = SimplifyMultipleUseDemandedBits( LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1); SDValue NewRHS = SimplifyMultipleUseDemandedBits( RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1); if (NewSel || NewLHS || NewRHS) { NewSel = NewSel ? NewSel : Sel; NewLHS = NewLHS ? NewLHS : LHS; NewRHS = NewRHS ? NewRHS : RHS; return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT, NewSel, NewLHS, NewRHS)); } break; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Vec = Op.getOperand(0); auto *CIdx = dyn_cast(Op.getOperand(1)); MVT VecVT = Vec.getSimpleValueType(); unsigned NumVecElts = VecVT.getVectorNumElements(); if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) { unsigned Idx = CIdx->getZExtValue(); unsigned VecBitWidth = VecVT.getScalarSizeInBits(); // If we demand no bits from the vector then we must have demanded // bits from the implict zext - simplify to zero. 
APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth); if (DemandedVecBits == 0) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); APInt KnownUndef, KnownZero; APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx); if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; KnownBits KnownVec; if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, KnownVec, TLO, Depth + 1)) return true; if (SDValue V = SimplifyMultipleUseDemandedBits( Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1)) return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1))); Known = KnownVec.zext(BitWidth); return false; } break; } case X86ISD::PINSRB: case X86ISD::PINSRW: { SDValue Vec = Op.getOperand(0); SDValue Scl = Op.getOperand(1); auto *CIdx = dyn_cast(Op.getOperand(2)); MVT VecVT = Vec.getSimpleValueType(); if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) { unsigned Idx = CIdx->getZExtValue(); if (!OriginalDemandedElts[Idx]) return TLO.CombineTo(Op, Vec); KnownBits KnownVec; APInt DemandedVecElts(OriginalDemandedElts); DemandedVecElts.clearBit(Idx); if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts, KnownVec, TLO, Depth + 1)) return true; KnownBits KnownScl; unsigned NumSclBits = Scl.getScalarValueSizeInBits(); APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits); if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1)) return true; KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits()); Known = KnownVec.intersectWith(KnownScl); return false; } break; } case X86ISD::PACKSS: // PACKSS saturates to MIN/MAX integer values. So if we just want the // sign bit then we can just ask for the source operands sign bit. // TODO - add known bits handling. 
if (OriginalDemandedBits.isSignMask()) { APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS); KnownBits KnownLHS, KnownRHS; APInt SignMask = APInt::getSignMask(BitWidth * 2); if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS, KnownLHS, TLO, Depth + 1)) return true; if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS, KnownRHS, TLO, Depth + 1)) return true; // Attempt to avoid multi-use ops if we don't need anything from them. SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1); SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1); if (DemandedOp0 || DemandedOp1) { SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0); SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1); return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1)); } } // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support. break; case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); APInt DemandedElts = APInt::getOneBitSet( SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0); if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known, TLO, Depth + 1)) return true; // If we don't need the upper bits, attempt to narrow the broadcast source. // Don't attempt this on AVX512 as it might affect broadcast folding. // TODO: Should we attempt this for i32/i16 splats? They tend to be slower. 
if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() && OriginalDemandedBits.countl_zero() >= (BitWidth / 2) && Src->hasOneUse()) { MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2); SDValue NewSrc = TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src); MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2); SDValue NewBcst = TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc); return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst)); } break; } case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). // iff we only need the sign bit then we can use R directly. if (OriginalDemandedBits.isSignMask() && ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) return TLO.CombineTo(Op, Op.getOperand(1)); break; case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); unsigned SrcBits = SrcVT.getScalarSizeInBits(); unsigned NumElts = SrcVT.getVectorNumElements(); // If we don't need the sign bits at all just return zero. if (OriginalDemandedBits.countr_zero() >= NumElts) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); // See if we only demand bits from the lower 128-bit vector. if (SrcVT.is256BitVector() && OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) { SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src)); return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); } // Only demand the vector elements of the sign bits we need. APInt KnownUndef, KnownZero; APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts); if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; Known.Zero = KnownZero.zext(BitWidth); Known.Zero.setHighBits(BitWidth - NumElts); // MOVMSK only uses the MSB from each vector element. 
KnownBits KnownSrc; APInt DemandedSrcBits = APInt::getSignMask(SrcBits); if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO, Depth + 1)) return true; if (KnownSrc.One[SrcBits - 1]) Known.One.setLowBits(NumElts); else if (KnownSrc.Zero[SrcBits - 1]) Known.Zero.setLowBits(NumElts); // Attempt to avoid multi-use os if we don't need anything from it. if (SDValue NewSrc = SimplifyMultipleUseDemandedBits( Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1)) return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); return false; } case X86ISD::TESTP: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); MVT OpVT = Op0.getSimpleValueType(); assert((OpVT.getVectorElementType() == MVT::f32 || OpVT.getVectorElementType() == MVT::f64) && "Illegal vector type for X86ISD::TESTP"); // TESTPS/TESTPD only demands the sign bits of ALL the elements. KnownBits KnownSrc; APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits()); bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode()); return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1, AssumeSingleUse) || SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1, AssumeSingleUse); } case X86ISD::CMOV: { KnownBits Known2; if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits, OriginalDemandedElts, Known2, TLO, Depth + 1)) return true; if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; // Only known if known in both the LHS and RHS. Known = Known.intersectWith(Known2); break; } case X86ISD::BEXTR: case X86ISD::BEXTRI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Only bottom 16-bits of the control bits are required. if (auto *Cst1 = dyn_cast(Op1)) { // NOTE: SimplifyDemandedBits won't do this for constants. 
uint64_t Val1 = Cst1->getZExtValue(); uint64_t MaskedVal1 = Val1 & 0xFFFF; if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) { SDLoc DL(Op); return TLO.CombineTo( Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0, TLO.DAG.getConstant(MaskedVal1, DL, VT))); } unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); // If the length is 0, the result is 0. if (Length == 0) { Known.setAllZero(); return false; } if ((Shift + Length) <= BitWidth) { APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length); if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1)) return true; Known = Known.extractBits(Length, Shift); Known = Known.zextOrTrunc(BitWidth); return false; } } else { assert(Opc == X86ISD::BEXTR && "Unexpected opcode!"); KnownBits Known1; APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16)); if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1)) return true; // If the length is 0, replace with 0. KnownBits LengthBits = Known1.extractBits(8, 8); if (LengthBits.isZero()) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); } break; } case X86ISD::PDEP: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero(); APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); // If the demanded bits has leading zeroes, we don't demand those from the // mask. if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) return true; // The number of possible 1s in the mask determines the number of LSBs of // operand 0 used. Undemanded bits from the mask don't matter so filter // them before counting. KnownBits Known2; uint64_t Count = (~Known.Zero & LoMask).popcount(); APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) return true; // Zeroes are retained from the mask, but not ones. 
Known.One.clearAllBits(); // The result will have at least as many trailing zeros as the non-mask // operand since bits can only map to the same or higher bit position. Known.Zero.setLowBits(Known2.countMinTrailingZeros()); return false; } } return TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const { int NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); switch (Opc) { case X86ISD::PINSRB: case X86ISD::PINSRW: { // If we don't demand the inserted element, return the base vector. SDValue Vec = Op.getOperand(0); auto *CIdx = dyn_cast(Op.getOperand(2)); MVT VecVT = Vec.getSimpleValueType(); if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) && !DemandedElts[CIdx->getZExtValue()]) return Vec; break; } case X86ISD::VSHLI: { // If we are only demanding sign bits then we can use the shift source // directly. SDValue Op0 = Op.getOperand(0); unsigned ShAmt = Op.getConstantOperandVal(1); unsigned BitWidth = DemandedBits.getBitWidth(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero(); if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits) return Op0; break; } case X86ISD::VSRAI: // iff we only need the sign bit then we can use the source directly. // TODO: generalize where we only demand extended signbits. if (DemandedBits.isSignMask()) return Op.getOperand(0); break; case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). // iff we only need the sign bit then we can use R directly. if (DemandedBits.isSignMask() && ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) return Op.getOperand(1); break; case X86ISD::BLENDV: { // BLENDV: Cond (MSB) ? 
LHS : RHS SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1); if (CondKnown.isNegative()) return LHS; if (CondKnown.isNonNegative()) return RHS; break; } case X86ISD::ANDNP: { // ANDNP = (~LHS & RHS); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1); KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1); // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then // the (inverted) LHS bits cannot contribute to the result of the 'andn' in // this context, so return RHS. if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) return RHS; break; } } APInt ShuffleUndef, ShuffleZero; SmallVector ShuffleMask; SmallVector ShuffleOps; if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask, ShuffleUndef, ShuffleZero, DAG, Depth, false)) { // If all the demanded elts are from one operand and are inline, // then we can use the operand directly. int NumOps = ShuffleOps.size(); if (ShuffleMask.size() == (unsigned)NumElts && llvm::all_of(ShuffleOps, [VT](SDValue V) { return VT.getSizeInBits() == V.getValueSizeInBits(); })) { if (DemandedElts.isSubsetOf(ShuffleUndef)) return DAG.getUNDEF(VT); if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero)) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op)); // Bitmask that indicates which ops have only been accessed 'inline'. 
APInt IdentityOp = APInt::getAllOnes(NumOps); for (int i = 0; i != NumElts; ++i) { int M = ShuffleMask[i]; if (!DemandedElts[i] || ShuffleUndef[i]) continue; int OpIdx = M / NumElts; int EltIdx = M % NumElts; if (M < 0 || EltIdx != i) { IdentityOp.clearAllBits(); break; } IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx); if (IdentityOp == 0) break; } assert((IdentityOp == 0 || IdentityOp.popcount() == 1) && "Multiple identity shuffles detected"); if (IdentityOp != 0) return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]); } } return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( Op, DemandedBits, DemandedElts, DAG, Depth); } bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const { unsigned NumElts = DemandedElts.getBitWidth(); switch (Op.getOpcode()) { case X86ISD::PSHUFD: case X86ISD::VPERMILPI: { SmallVector Mask; SmallVector Ops; if (getTargetShuffleMask(Op, true, Ops, Mask)) { SmallVector DemandedSrcElts(Ops.size(), APInt::getZero(NumElts)); for (auto M : enumerate(Mask)) { if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero) continue; if (M.value() == SM_SentinelUndef) return false; assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) && "Shuffle mask index out of range"); DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts); } for (auto Op : enumerate(Ops)) if (!DemandedSrcElts[Op.index()].isZero() && !DAG.isGuaranteedNotToBeUndefOrPoison( Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1)) return false; return true; } break; } } return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, Depth); } bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { // SSE vector 
multiplies are either inbounds or saturate. case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: // SSE vector shifts handle out of bounds shift amounts. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: return false; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: return false; } return TargetLowering::canCreateUndefOrPoisonForTargetNode( Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth); } bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); switch (Opc) { case X86ISD::VBROADCAST: case X86ISD::VBROADCAST_LOAD: UndefElts = APInt::getZero(NumElts); return true; } return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts, DAG, Depth); } // Helper to peek through bitops/trunc/setcc to determine size of source vector. // Allows combineBitcastvxi1 to determine what size vector generated a . 
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate) { switch (Src.getOpcode()) { case ISD::TRUNCATE: if (!AllowTruncate) return false; [[fallthrough]]; case ISD::SETCC: return Src.getOperand(0).getValueSizeInBits() == Size; case ISD::FREEZE: return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate); case ISD::AND: case ISD::XOR: case ISD::OR: return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) && checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate); case ISD::SELECT: case ISD::VSELECT: return Src.getOperand(0).getScalarValueSizeInBits() == 1 && checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) && checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate); case ISD::BUILD_VECTOR: return ISD::isBuildVectorAllZeros(Src.getNode()) || ISD::isBuildVectorAllOnes(Src.getNode()); } return false; } // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents. static unsigned getAltBitOpcode(unsigned Opcode) { switch(Opcode) { // clang-format off case ISD::AND: return X86ISD::FAND; case ISD::OR: return X86ISD::FOR; case ISD::XOR: return X86ISD::FXOR; case X86ISD::ANDNP: return X86ISD::FANDN; // clang-format on } llvm_unreachable("Unknown bitwise opcode"); } // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets. 
// Given a v4i1 mask Src, returns a v4f32 value carrying the same information
// in its sign bits, or an empty SDValue when Src has no recognised shape.
// Recognised shapes:
//  - (setcc X:v4i32, zeroinitializer, setlt) where X is a normal load
//    (re-typed as v4f32 via bitcast) or a bitcast from v4f32 (the bitcast
//    source is returned directly);
//  - AND/XOR/OR of two recognised shapes, rebuilt with the matching X86ISD
//    floating-point logic opcode.
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
                                          const SDLoc &DL) {
  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::v4i1)
    return SDValue();

  switch (Src.getOpcode()) {
  case ISD::SETCC:
    // Only a signed "less than zero" test on v4i32 is a pure sign-bit read.
    if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
        ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
        cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
      SDValue Op0 = Src.getOperand(0);
      if (ISD::isNormalLoad(Op0.getNode()))
        return DAG.getBitcast(MVT::v4f32, Op0);
      if (Op0.getOpcode() == ISD::BITCAST &&
          Op0.getOperand(0).getValueType() == MVT::v4f32)
        return Op0.getOperand(0);
    }
    break;
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    // Both sides must be convertible for the logic op to move to v4f32.
    SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
    SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
    if (Op0 && Op1)
      return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
                         Op1);
    break;
  }
  }
  return SDValue();
}

// Helper to push sign extension of vXi1 SETCC result through bitops.
//
// Leaves (SETCC, FREEZE, TRUNCATE, BUILD_VECTOR) are sign-extended to SExtVT
// directly. AND/XOR/OR are rebuilt in SExtVT from their recursively extended
// operands. SELECT/VSELECT keep their original condition and have both value
// operands extended recursively. Any other opcode is unreachable; the handled
// set is the same set of opcodes that checkBitcastSrcVectorSize accepts.
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
                                          SDValue Src, const SDLoc &DL) {
  switch (Src.getOpcode()) {
  case ISD::SETCC:
  case ISD::FREEZE:
  case ISD::TRUNCATE:
  case ISD::BUILD_VECTOR:
    return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    return DAG.getNode(
        Src.getOpcode(), DL, SExtVT,
        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
  case ISD::SELECT:
  case ISD::VSELECT:
    return DAG.getSelect(
        DL, SExtVT, Src.getOperand(0),
        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
        signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
  }
  llvm_unreachable("Unexpected node type for vXi1 sign extension");
}

// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (v16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
// Lowers a bitcast of the vXi1 vector Src to the type VT by sign-extending
// the mask to a vector type that has a MOVMSK flavor and reading the sign
// bits with (P)MOVMSK. Returns an empty SDValue when Src is not a simple vXi1
// type, when the subtarget should use k-registers instead, or when no
// suitable extended type exists.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
                                  const SDLoc &DL,
                                  const X86Subtarget &Subtarget) {
  EVT SrcVT = Src.getValueType();
  if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
    return SDValue();

  // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
  // legalization destroys the v4i32 type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
    if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
      V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
                      DAG.getBitcast(MVT::v4f32, V));
      return DAG.getZExtOrTrunc(V, DL, VT);
    }
  }

  // If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
  // movmskb even with avx512. This will be better than truncating to vXi1 and
  // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
  // vpcmpeqb/vpcmpgtb.
  bool PreferMovMsk = false;
  if (Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse()) {
    EVT TruncSrcVT = Src.getOperand(0).getValueType();
    PreferMovMsk = TruncSrcVT == MVT::v16i8 || TruncSrcVT == MVT::v32i8 ||
                   TruncSrcVT == MVT::v64i8;
  }

  // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
  // directly with vpmovmskb/vmovmskps/vmovmskpd.
  if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
      cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
      ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
    EVT CmpVT = Src.getOperand(0).getValueType();
    EVT EltVT = CmpVT.getVectorElementType();
    if (CmpVT.getSizeInBits() <= 256 &&
        (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
      PreferMovMsk = true;
  }

  // With AVX512 vxi1 types are legal and we prefer using k-regs.
  // MOVMSK is supported in SSE2 or later.
  if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
    return SDValue();

  // If the upper ops of a concatenation are undef, then try to bitcast the
  // lower op and extend.
  SmallVector<SDValue, 4> SubSrcOps;
  if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
      SubSrcOps.size() >= 2) {
    SDValue LowerOp = SubSrcOps[0];
    ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
    if (LowerOp.getOpcode() == ISD::SETCC &&
        all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
      // Integer type with one bit per element of the lower subvector.
      EVT SubVT = EVT::getIntegerVT(
          *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
      if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
        EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
        return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
      }
    }
  }

  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
  // v8i16 and v16i16.
  // For these two cases, we can shuffle the upper element bytes to a
  // consecutive sequence at the start of the vector and treat the results as
  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
  // for v16i16 this is not the case, because the shuffle is expensive, so we
  // avoid sign-extending to this type entirely.
  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
  MVT SExtVT;
  bool PropagateSExt = false;
  switch (SrcVT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::v2i1:
    SExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    SExtVT = MVT::v4i32;
    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
    // sign-extend to a 256-bit operation to avoid truncation.
    if (Subtarget.hasAVX() &&
        checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
      SExtVT = MVT::v4i64;
      PropagateSExt = true;
    }
    break;
  case MVT::v8i1:
    SExtVT = MVT::v8i16;
    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
    // sign-extend to a 256-bit operation to match the compare.
    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
    // 256-bit because the shuffle is cheaper than sign extending the result of
    // the compare.
    if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
                               checkBitcastSrcVectorSize(Src, 512, true))) {
      SExtVT = MVT::v8i32;
      PropagateSExt = true;
    }
    break;
  case MVT::v16i1:
    SExtVT = MVT::v16i8;
    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
    // it is not profitable to sign-extend to 256-bit because this will
    // require an extra cross-lane shuffle which is more expensive than
    // truncating the result of the compare to 128-bits.
    break;
  case MVT::v32i1:
    SExtVT = MVT::v32i8;
    break;
  case MVT::v64i1:
    // With AVX512F we only get here when the truncate-from-vXi8 or setlt
    // checks above set PreferMovMsk. Without AVX512BW, split the input and
    // make two pmovmskbs; with AVX512BW give up here.
    if (Subtarget.hasAVX512()) {
      if (Subtarget.hasBWI())
        return SDValue();
      SExtVT = MVT::v64i8;
      break;
    }
    // Split if this is a <64 x i8> comparison result.
    if (checkBitcastSrcVectorSize(Src, 512, false)) {
      SExtVT = MVT::v64i8;
      break;
    }
    return SDValue();
  }

  // Either push the sign extension down to the mask's leaves or extend the
  // mask as a whole.
  SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
                            : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);

  if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
    V = getPMOVMSKB(DL, V, DAG, Subtarget);
  } else {
    // v8i16 has no MOVMSK flavor: widen to 256 bits and truncate to v16i8
    // first.
    if (SExtVT == MVT::v8i16) {
      V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
    }
    V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
  }

  // Resize the mask to one bit per source element, then cast to the
  // requested result type.
  EVT IntVT =
      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
  V = DAG.getZExtOrTrunc(V, DL, IntVT);
  return DAG.getBitcast(VT, V);
}

// Convert a vXi1 constant build vector to the same width scalar integer.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) { EVT SrcVT = Op.getValueType(); assert(SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector"); assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && "Expected a constant build vector"); APInt Imm(SrcVT.getVectorNumElements(), 0); for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) { SDValue In = Op.getOperand(Idx); if (!In.isUndef() && (In->getAsZExtVal() & 0x1)) Imm.setBit(Idx); } EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth()); return DAG.getConstant(Imm, SDLoc(Op), IntVT); } static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast"); if (!DCI.isBeforeLegalizeOps()) return SDValue(); // Only do this if we have k-registers. if (!Subtarget.hasAVX512()) return SDValue(); EVT DstVT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); if (!Op.hasOneUse()) return SDValue(); // Look for logic ops. if (Op.getOpcode() != ISD::AND && Op.getOpcode() != ISD::OR && Op.getOpcode() != ISD::XOR) return SDValue(); // Make sure we have a bitcast between mask registers and a scalar type. 
if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && DstVT.isScalarInteger()) && !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 && SrcVT.isScalarInteger())) return SDValue(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST && LHS.getOperand(0).getValueType() == DstVT) return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0), DAG.getBitcast(DstVT, RHS)); if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST && RHS.getOperand(0).getValueType() == DstVT) return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, DAG.getBitcast(DstVT, LHS), RHS.getOperand(0)); // If the RHS is a vXi1 build vector, this is a good reason to flip too. // Most of these have to move a constant from the scalar domain anyway. if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) { RHS = combinevXi1ConstantToInteger(RHS, DAG); return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, DAG.getBitcast(DstVT, LHS), RHS); } return SDValue(); } static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(BV); unsigned NumElts = BV->getNumOperands(); SDValue Splat = BV->getSplatValue(); // Build MMX element from integer GPR or SSE float values. auto CreateMMXElement = [&](SDValue V) { if (V.isUndef()) return DAG.getUNDEF(MVT::x86mmx); if (V.getValueType().isFloatingPoint()) { if (Subtarget.hasSSE1() && !isa(V)) { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V); V = DAG.getBitcast(MVT::v2i64, V); return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V); } V = DAG.getBitcast(MVT::i32, V); } else { V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32); } return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V); }; // Convert build vector ops to MMX data in the bottom elements. SmallVector Ops; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element. 
if (Splat) { if (Splat.isUndef()) return DAG.getUNDEF(MVT::x86mmx); Splat = CreateMMXElement(Splat); if (Subtarget.hasSSE1()) { // Unpack v8i8 to splat i8 elements to lowest 16-bits. if (NumElts == 8) Splat = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL, TLI.getPointerTy(DAG.getDataLayout())), Splat, Splat); // Use PSHUFW to repeat 16-bit elements. unsigned ShufMask = (NumElts > 2 ? 0 : 0x44); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, TLI.getPointerTy(DAG.getDataLayout())), Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8)); } Ops.append(NumElts, Splat); } else { for (unsigned i = 0; i != NumElts; ++i) Ops.push_back(CreateMMXElement(BV->getOperand(i))); } // Use tree of PUNPCKLs to build up general MMX vector. while (Ops.size() > 1) { unsigned NumOps = Ops.size(); unsigned IntrinOp = (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd : Intrinsic::x86_mmx_punpcklbw)); SDValue Intrin = DAG.getTargetConstant( IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout())); for (unsigned i = 0; i != NumOps; i += 2) Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin, Ops[i], Ops[i + 1]); Ops.resize(NumOps / 2); } return Ops[0]; } // Recursive function that attempts to find if a bool vector node was originally // a vector/float/double that got truncated/extended/bitcast to/from a scalar // integer. If so, replace the scalar ops with bool vector equivalents back down // the chain. static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth = 0) { if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); // Limit search depth. 
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned Opc = V.getOpcode(); switch (Opc) { case ISD::BITCAST: { // Bitcast from a vector/float/double, we can cheaply bitcast to VT. SDValue Src = V.getOperand(0); EVT SrcVT = Src.getValueType(); if (SrcVT.isVector() || SrcVT.isFloatingPoint()) return DAG.getBitcast(VT, Src); break; } case ISD::Constant: { auto *C = cast(V); if (C->isZero()) return DAG.getConstant(0, DL, VT); if (C->isAllOnes()) return DAG.getAllOnesConstant(DL, VT); break; } case ISD::TRUNCATE: { // If we find a suitable source, a truncated scalar becomes a subvector. SDValue Src = V.getOperand(0); EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits()); if (TLI.isTypeLegal(NewSrcVT)) if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget, Depth + 1)) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0, DAG.getIntPtrConstant(0, DL)); break; } case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: { // If we find a suitable source, an extended scalar becomes a subvector. SDValue Src = V.getOperand(0); EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getScalarValueSizeInBits()); if (TLI.isTypeLegal(NewSrcVT)) if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget, Depth + 1)) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT) : DAG.getConstant(0, DL, VT), N0, DAG.getIntPtrConstant(0, DL)); break; } case ISD::OR: case ISD::XOR: { // If we find suitable sources, we can just move the op to the vector // domain. if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG, Subtarget, Depth + 1)) if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG, Subtarget, Depth + 1)) return DAG.getNode(Opc, DL, VT, N0, N1); break; } case ISD::SHL: { // If we find a suitable source, a SHL becomes a KSHIFTL. 
SDValue Src0 = V.getOperand(0); if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) || ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI())) break; if (auto *Amt = dyn_cast(V.getOperand(1))) if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget, Depth + 1)) return DAG.getNode( X86ISD::KSHIFTL, DL, VT, N0, DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8)); break; } } // Does the inner bitcast already exist? if (Depth > 0) if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V})) return SDValue(Alt, 0); return SDValue(); } static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. if (DCI.isBeforeLegalize()) { SDLoc dl(N); if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && Subtarget.hasAVX512()) { N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); N0 = DAG.getBitcast(MVT::v8i1, N0); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, DAG.getIntPtrConstant(0, dl)); } // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { // Use zeros for the widening if we already have some zeroes. This can // allow SimplifyDemandedBits to remove scalar ANDs that may be down // stream of this. 
// FIXME: It might make sense to detect a concat_vectors with a mix of // zeroes and undef and turn it into insert_subvector for i1 vectors as // a separate combine. What we can't do is canonicalize the operands of // such a concat or we'll get into a loop with SimplifyDemandedBits. if (N0.getOpcode() == ISD::CONCAT_VECTORS) { SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1); if (ISD::isBuildVectorAllZeros(LastOp.getNode())) { SrcVT = LastOp.getValueType(); unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector Ops(N0->op_begin(), N0->op_end()); Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT)); N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } } unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } } else { // If we're bitcasting from iX to vXi1, see if the integer originally // began as a vXi1 and whether we can remove the bitcast entirely. if (VT.isVector() && VT.getScalarType() == MVT::i1 && SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) { if (SDValue V = combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget)) return V; } } // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur // due to insert_subvector legalization on KNL. By promoting the copy to i16 // we can help with known bits propagation from the vXi1 domain to the // scalar domain. 
if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() && !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.getOperand(0).getValueType() == MVT::v16i1 && isNullConstant(N0.getOperand(1))) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, DAG.getBitcast(MVT::i16, N0.getOperand(0))); // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast // and the vbroadcast_load are both integer or both fp. In some cases this // will remove the bitcast entirely. if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) { auto *BCast = cast(N0); unsigned SrcVTSize = SrcVT.getScalarSizeInBits(); unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits(); // Don't swap i8/i16 since don't have fp types that size. if (MemSize >= 32) { MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize) : MVT::getIntegerVT(MemSize); MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize) : MVT::getIntegerVT(SrcVTSize); LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements()); SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, MemVT, BCast->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); return DAG.getBitcast(VT, ResNode); } } // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. if (VT == MVT::x86mmx) { // Detect MMX constant vectors. APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits, /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ true)) { SDLoc DL(N0); // Handle zero-extension of i32 with MOVD. 
if (EltBits[0].countl_zero() >= 32) return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT, DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32)); // Else, bitcast to a double. // TODO - investigate supporting sext 32-bit immediates on x86_64. APFloat F64(APFloat::IEEEdouble(), EltBits[0]); return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64)); } // Detect bitcasts to x86mmx low word. if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) && N0.getOperand(0).getValueType() == SrcVT.getScalarType()) { bool LowUndef = true, AllUndefOrZero = true; for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) { SDValue Op = N0.getOperand(i); LowUndef &= Op.isUndef() || (i >= e/2); AllUndefOrZero &= isNullConstantOrUndef(Op); } if (AllUndefOrZero) { SDValue N00 = N0.getOperand(0); SDLoc dl(N00); N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32) : DAG.getZExtOrTrunc(N00, dl, MVT::i32); return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00); } } // Detect bitcasts of 64-bit build vectors and convert to a // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the // lowest element. if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8)) return createMMXBuildVector(cast(N0), DAG, Subtarget); // Detect bitcasts between element or subvector extraction to x86mmx. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0.getOperand(0); if (N00.getValueType().is128BitVector()) return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, DAG.getBitcast(MVT::v2i64, N00)); } // Detect bitcasts from FP_TO_SINT to x86mmx. 
if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) { SDLoc DL(N0); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, DAG.getBitcast(MVT::v2i64, Res)); } } // Try to remove a bitcast of constant vXi1 vector. We have to legalize // most of these to scalar anyway. if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { return combinevXi1ConstantToInteger(N0, DAG); } if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() && VT.getVectorElementType() == MVT::i1) { if (auto *C = dyn_cast(N0)) { if (C->isAllOnes()) return DAG.getConstant(1, SDLoc(N0), VT); if (C->isZero()) return DAG.getConstant(0, SDLoc(N0), VT); } } // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1. // Turn it into a sign bit compare that produces a k-register. This avoids // a trip through a GPR. if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && isPowerOf2_32(VT.getVectorNumElements())) { unsigned NumElts = VT.getVectorNumElements(); SDValue Src = N0; // Peek through truncate. if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) Src = N0.getOperand(0); if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) { SDValue MovmskIn = Src.getOperand(0); MVT MovmskVT = MovmskIn.getSimpleValueType(); unsigned MovMskElts = MovmskVT.getVectorNumElements(); // We allow extra bits of the movmsk to be used since they are known zero. // We can't convert a VPMOVMSKB without avx512bw. 
if (MovMskElts <= NumElts && (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) { EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger(); MovmskIn = DAG.getBitcast(IntVT, MovmskIn); SDLoc dl(N); MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts); SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn, DAG.getConstant(0, dl, IntVT), ISD::SETLT); if (EVT(CmpVT) == VT) return Cmp; // Pad with zeroes up to original VT to replace the zeroes that were // being used from the MOVMSK. unsigned NumConcats = NumElts / MovMskElts; SmallVector Ops(NumConcats, DAG.getConstant(0, dl, CmpVT)); Ops[0] = Cmp; return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops); } } } // Try to remove bitcasts from input and output of mask arithmetic to // remove GPR<->K-register crossings. if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget)) return V; // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the // constant in an integer register and transferring it to an SSE register or // transferring the SSE operand to integer register and back. unsigned FPOpcode; switch (N0.getOpcode()) { // clang-format off case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); // clang-format on } // Check if we have a bitcast from another integer type as well. 
if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64) || (Subtarget.hasFP16() && VT == MVT::f16) || (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); SDValue LogicOp0 = N0.getOperand(0); SDValue LogicOp1 = N0.getOperand(1); SDLoc DL0(N0); // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && !isa(LogicOp0.getOperand(0))) { SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode(); return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); } // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && !isa(LogicOp1.getOperand(0))) { SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode(); return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); } return SDValue(); } // (mul (zext a), (sext, b)) static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1) { Op0 = Mul.getOperand(0); Op1 = Mul.getOperand(1); // The operand1 should be signed extend if (Op0.getOpcode() == ISD::SIGN_EXTEND) std::swap(Op0, Op1); auto IsFreeTruncation = [](SDValue &Op) -> bool { if ((Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::SIGN_EXTEND) && Op.getOperand(0).getScalarValueSizeInBits() <= 8) return true; auto *BV = dyn_cast(Op); return (BV && BV->isConstant()); }; // (dpbusd (zext a), (sext, b)). Since the first operand should be unsigned // value, we need to check Op0 is zero extended value. Op1 should be signed // value, so we just check the signed bits. 
if ((IsFreeTruncation(Op0) && DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) && (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8)) return true; return false; } // Given a ABS node, detect the following pattern: // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))). // This is useful as it is the input into a SAD pattern. static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) { SDValue AbsOp1 = Abs->getOperand(0); if (AbsOp1.getOpcode() != ISD::SUB) return false; Op0 = AbsOp1.getOperand(0); Op1 = AbsOp1.getOperand(1); // Check if the operands of the sub are zero-extended from vectors of i8. if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || Op1.getOpcode() != ISD::ZERO_EXTEND || Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) return false; return true; } static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget) { // Extend or truncate to MVT::i8 first. MVT Vi8VT = MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount()); LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT); RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT); // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3]. // The src A, B element type is i8, but the dst C element type is i32. // When we calculate the reduce stage, we use src vector type vXi8 for it // so we need logbias 2 to avoid extra 2 stages. LogBias = 2; unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits()); if (Subtarget.hasVNNI() && !Subtarget.hasVLX()) RegSize = std::max(512u, RegSize); // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we // fill in the missing vector elements with 0. 
unsigned NumConcat = RegSize / Vi8VT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT)); Ops[0] = LHS; MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); Ops[0] = RHS; SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); // Actually build the DotProduct, split as 256/512 bits for // AVXVNNI/AVX512VNNI. auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops); }; MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Zero = DAG.getConstant(0, DL, DpVT); return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, DpBuilder, false); } // Given two zexts of to , create a PSADBW of the inputs // to these zexts. static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget) { // Find the appropriate width for the PSADBW. EVT InVT = Zext0.getOperand(0).getValueType(); unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits()); // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we // fill in the missing vector elements with 0. unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getConstant(0, DL, InVT)); Ops[0] = Zext0.getOperand(0); MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); Ops[0] = Zext1.getOperand(0); SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW. 
auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64); return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops); }; MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 }, PSADBWBuilder); } // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with // PHMINPOSUW. static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE41. if (!Subtarget.hasSSE41()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. ISD::NodeType BinOp; SDValue Src = DAG.matchBinOpReduction( Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true); if (!Src) return SDValue(); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) return SDValue(); SDLoc DL(Extract); SDValue MinPos = Src; // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { SDValue Lo, Hi; std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL); SrcVT = Lo.getValueType(); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && "Unexpected value type"); // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask // to flip the value accordingly. 
SDValue Mask; unsigned MaskEltsBits = ExtractVT.getSizeInBits(); if (BinOp == ISD::SMAX) Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::SMIN) Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::UMAX) Mask = DAG.getAllOnesConstant(DL, SrcVT); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); // For v16i8 cases we need to perform UMIN on pairs of byte elements, // shuffling each upper element down and insert zeros. This means that the // v16i8 UMIN will leave the upper element as zero, performing zero-extension // ready for the PHMINPOS. if (ExtractVT == MVT::i8) { SDValue Upper = DAG.getVectorShuffle( SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8), {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); } // Perform the PHMINPOS on a v8i16 vector, MinPos = DAG.getBitcast(MVT::v8i16, MinPos); MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); MinPos = DAG.getBitcast(SrcVT, MinPos); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, DAG.getIntPtrConstant(0, DL)); } // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE2. if (!Subtarget.hasSSE2()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && ExtractVT != MVT::i8 && ExtractVT != MVT::i1) return SDValue(); // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns. 
ISD::NodeType BinOp; SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); if (!Match && ExtractVT == MVT::i1) Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR}); if (!Match) return SDValue(); // EXTRACT_VECTOR_ELT can require implicit extension of the vector element // which we can't support here for now. if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); SDValue Movmsk; SDLoc DL(Extract); EVT MatchVT = Match.getValueType(); unsigned NumElts = MatchVT.getVectorNumElements(); unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext &Ctx = *DAG.getContext(); if (ExtractVT == MVT::i1) { // Special case for (pre-legalization) vXi1 reductions. if (NumElts > 64 || !isPowerOf2_32(NumElts)) return SDValue(); if (Match.getOpcode() == ISD::SETCC) { ISD::CondCode CC = cast(Match.getOperand(2))->get(); if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) || (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) { // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y. // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y. X86::CondCode X86CC; SDValue LHS = DAG.getFreeze(Match.getOperand(0)); SDValue RHS = DAG.getFreeze(Match.getOperand(1)); APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits()); if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget, DAG, X86CC)) return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT, getSETCC(X86CC, V, DL, DAG)); } } if (TLI.isTypeLegal(MatchVT)) { // If this is a legal AVX512 predicate type then we can just bitcast. EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { // Use combineBitcastvxi1 to create the MOVMSK. 
while (NumElts > MaxElts) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); NumElts /= 2; } EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts); Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); } if (!Movmsk) return SDValue(); Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { // FIXME: Better handling of k-registers or 512-bit vectors? unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && Subtarget.hasAVX()))) return SDValue(); // Make sure this isn't a vector of 1 element. The perf win from using // MOVMSK diminishes with less elements in the reduction, but it is // generally better to get the comparison over to the GPRs as soon as // possible to reduce the number of vector ops. if (Match.getValueType().getVectorNumElements() < 2) return SDValue(); // Check that we are extracting a reduction of all sign bits. if (DAG.ComputeNumSignBits(Match) != BitWidth) return SDValue(); if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); MatchSizeInBits = Match.getValueSizeInBits(); } // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. MVT MaskSrcVT; if (64 == BitWidth || 32 == BitWidth) MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), MatchSizeInBits / BitWidth); else MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match); Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); NumElts = MaskSrcVT.getVectorNumElements(); } assert((NumElts <= 32 || NumElts == 64) && "Not expecting more than 64 elements"); MVT CmpVT = NumElts == 64 ? 
MVT::i64 : MVT::i32; if (BinOp == ISD::XOR) { // parity -> (PARITY(MOVMSK X)) SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk); return DAG.getZExtOrTrunc(Result, DL, ExtractVT); } SDValue CmpC; ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 CmpC = DAG.getConstant(0, DL, CmpVT); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts), DL, CmpVT); CondCode = ISD::CondCode::SETEQ; } // The setcc produces an i8 of 0/1, so extend that to the result width and // negate to get the final 0/-1 mask value. EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT); SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); return DAG.getNegative(Zext, DL, ExtractVT); } static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); // Verify the type we're extracting is i32, as the output element type of // vpdpbusd is i32. if (ExtractVT != MVT::i32) return SDValue(); EVT VT = Extract->getOperand(0).getValueType(); if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Match shuffle + add pyramid. ISD::NodeType BinOp; SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); // We can't combine to vpdpbusd for zext, because each of the 4 multiplies // done by vpdpbusd compute a signed 16-bit product that will be sign extended // before adding into the accumulator. // TODO: // We also need to verify that the multiply has at least 2x the number of bits // of the input. We shouldn't match // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))). 
// if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND)) // Root = Root.getOperand(0); // If there was a match, we want Root to be a mul. if (!Root || Root.getOpcode() != ISD::MUL) return SDValue(); // Check whether we have an extend and mul pattern SDValue LHS, RHS; if (!detectExtMul(DAG, Root, LHS, RHS)) return SDValue(); // Create the dot product instruction. SDLoc DL(Extract); unsigned StageBias; SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget); // If the original vector was wider than 4 elements, sum over the results // in the DP vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); EVT DpVT = DP.getValueType(); if (Stages > StageBias) { unsigned DpElems = DpVT.getVectorNumElements(); for (unsigned i = Stages - StageBias; i > 0; --i) { SmallVector Mask(DpElems, -1); for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) Mask[j] = MaskEnd + j; SDValue Shuffle = DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask); DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle); } } // Return the lowest ExtractSizeInBits bits. EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT, DpVT.getSizeInBits() / ExtractVT.getSizeInBits()); DP = DAG.getBitcast(ResVT, DP); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP, Extract->getOperand(1)); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); // Verify the type we're extracting is either i32 or i64. // FIXME: Could support other types, but this is what we have coverage for. if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64) return SDValue(); EVT VT = Extract->getOperand(0).getValueType(); if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Match shuffle + add pyramid. 
ISD::NodeType BinOp; SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); // The operand is expected to be zero extended from i8 // (verified in detectZextAbsDiff). // In order to convert to i64 and above, additional any/zero/sign // extend is expected. // The zero extend from 32 bit has no mathematical effect on the result. // Also the sign extend is basically zero extend // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || Root.getOpcode() == ISD::ZERO_EXTEND || Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. if (!Root || Root.getOpcode() != ISD::ABS) return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. SDValue Zext0, Zext1; if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget); // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); EVT SadVT = SAD.getValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); for(unsigned i = Stages - 3; i > 0; --i) { SmallVector Mask(SadElems, -1); for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) Mask[j] = MaskEnd + j; SDValue Shuffle = DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); } } unsigned ExtractSizeInBits = ExtractVT.getSizeInBits(); // Return the lowest ExtractSizeInBits bits. 
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT, SadVT.getSizeInBits() / ExtractSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD, Extract->getOperand(1)); } // If this extract is from a loaded vector value and will be used as an // integer, that requires a potentially expensive XMM -> GPR transfer. // Additionally, if we can convert to a scalar integer load, that will likely // be folded into a subsequent integer op. // Note: SrcVec might not have a VecVT type, but it must be the same size. // Note: Unlike the related fold for this in DAGCombiner, this is not limited // to a single-use of the loaded vector. For the reasons above, we // expect this to be profitable even if it creates an extra load. static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Only EXTRACT_VECTOR_ELT supported so far"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { return Use->getOpcode() == ISD::STORE || Use->getOpcode() == ISD::INSERT_VECTOR_ELT || Use->getOpcode() == ISD::SCALAR_TO_VECTOR; }); auto *LoadVec = dyn_cast(SrcVec); if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() && VecVT.getVectorElementType() == VT && VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() && DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) { SDValue NewPtr = TLI.getVectorElementPointer( DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl)); unsigned PtrOff = VT.getSizeInBits() * Idx / 8; MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff); Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff); SDValue Load = DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, 
LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); DAG.makeEquivalentMemoryOrdering(LoadVec, Load); return Load; } return SDValue(); } // Attempt to peek through a target shuffle and extract the scalar from the // source. static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDLoc dl(N); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getVectorElementType(); unsigned SrcEltBits = SrcSVT.getSizeInBits(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); // Don't attempt this for boolean mask vectors or unknown extraction indices. if (SrcSVT == MVT::i1 || !isa(Idx)) return SDValue(); const APInt &IdxC = N->getConstantOperandAPInt(1); if (IdxC.uge(NumSrcElts)) return SDValue(); SDValue SrcBC = peekThroughBitcasts(Src); // Handle extract(bitcast(broadcast(scalar_value))). if (X86ISD::VBROADCAST == SrcBC.getOpcode()) { SDValue SrcOp = SrcBC.getOperand(0); EVT SrcOpVT = SrcOp.getValueType(); if (SrcOpVT.isScalarInteger() && VT.isInteger() && (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) { unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits; unsigned Offset = IdxC.urem(Scale) * SrcEltBits; // TODO support non-zero offsets. if (Offset == 0) { SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType()); SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT); return SrcOp; } } } // If we're extracting a single element from a broadcast load and there are // no other users, just create a single load. 
if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) { auto *MemIntr = cast(SrcBC); unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) { SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(), MemIntr->getPointerInfo(), MemIntr->getOriginalAlign(), MemIntr->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); return Load; } } // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers. // TODO: Move to DAGCombine? if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() && SrcBC.getValueType().isInteger() && (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 && SrcBC.getScalarValueSizeInBits() == SrcBC.getOperand(0).getValueSizeInBits()) { unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits; if (IdxC.ult(Scale)) { unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits(); SDValue Scl = SrcBC.getOperand(0); EVT SclVT = Scl.getValueType(); if (Offset) { Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl, DAG.getShiftAmountConstant(Offset, SclVT, dl)); } Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType()); Scl = DAG.getZExtOrTrunc(Scl, dl, VT); return Scl; } } // Handle extract(truncate(x)) for 0'th index. // TODO: Treat this as a faux shuffle? // TODO: When can we use this for general indices? if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 && (SrcVT.getSizeInBits() % 128) == 0) { Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits); return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src), Idx); } // We can only legally extract other elements from 128-bit vectors and in // certain circumstances, depending on SSE-level. // TODO: Investigate float/double extraction if it will be just stored. 
auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT, unsigned Idx) { EVT VecSVT = VecVT.getScalarType(); if ((VecVT.is256BitVector() || VecVT.is512BitVector()) && (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 || VecSVT == MVT::i64)) { unsigned EltSizeInBits = VecSVT.getSizeInBits(); unsigned NumEltsPerLane = 128 / EltSizeInBits; unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits; unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits(); VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane); Vec = extract128BitVector(Vec, LaneIdx, DAG, dl); Idx &= (NumEltsPerLane - 1); } if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) && ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(), DAG.getBitcast(VecVT, Vec), DAG.getIntPtrConstant(Idx, dl)); } if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) || (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) { unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec), DAG.getTargetConstant(Idx, dl, MVT::i8)); } return SDValue(); }; // Resolve the target shuffle inputs and mask. SmallVector Mask; SmallVector Ops; if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Shuffle inputs must be the same size as the result. if (llvm::any_of(Ops, [SrcVT](SDValue Op) { return SrcVT.getSizeInBits() != Op.getValueSizeInBits(); })) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. if (Mask.size() != NumSrcElts) { if ((NumSrcElts % Mask.size()) == 0) { SmallVector ScaledMask; int Scale = NumSrcElts / Mask.size(); narrowShuffleMaskElts(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { // Simplify Mask based on demanded element. 
int ExtractIdx = (int)IdxC.getZExtValue(); int Scale = Mask.size() / NumSrcElts; int Lo = Scale * ExtractIdx; int Hi = Scale * (ExtractIdx + 1); for (int i = 0, e = (int)Mask.size(); i != e; ++i) if (i < Lo || Hi <= i) Mask[i] = SM_SentinelUndef; SmallVector WidenedMask; while (Mask.size() > NumSrcElts && canWidenShuffleElements(Mask, WidenedMask)) Mask = std::move(WidenedMask); } } // If narrowing/widening failed, see if we can extract+zero-extend. int ExtractIdx; EVT ExtractVT; if (Mask.size() == NumSrcElts) { ExtractIdx = Mask[IdxC.getZExtValue()]; ExtractVT = SrcVT; } else { unsigned Scale = Mask.size() / NumSrcElts; if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint()) return SDValue(); unsigned ScaledIdx = Scale * IdxC.getZExtValue(); if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1)) return SDValue(); ExtractIdx = Mask[ScaledIdx]; EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale); ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size()); assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() && "Failed to widen vector type"); } // If the shuffle source element is undef/zero then we can just accept it. if (ExtractIdx == SM_SentinelUndef) return DAG.getUNDEF(VT); if (ExtractIdx == SM_SentinelZero) return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) : DAG.getConstant(0, dl, VT); SDValue SrcOp = Ops[ExtractIdx / Mask.size()]; ExtractIdx = ExtractIdx % Mask.size(); if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx)) return DAG.getZExtOrTrunc(V, dl, VT); if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT) if (SDValue V = combineExtractFromVectorLoad( N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI)) return V; return SDValue(); } /// Extracting a scalar FP value from vector element 0 is free, so extract each /// operand first, then perform the math as a scalar op. 
/// Only applies when the source vector has a single use, the index is the
/// constant 0 and the vector element type equals the extracted type; otherwise
/// (or for unsupported opcodes/types) an empty SDValue is returned.
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
  SDValue Vec = ExtElt->getOperand(0);
  SDValue Index = ExtElt->getOperand(1);
  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = Vec.getValueType();

  // TODO: If this is a unary/expensive/expand op, allow extraction from a
  // non-zero element because the shuffle+scalar op will be cheaper?
  if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
    return SDValue();

  // Vector FP compares don't fit the pattern of FP math ops (propagate, not
  // extract, the condition code), so deal with those as a special-case.
  if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
    EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
    if (OpVT != MVT::f32 && OpVT != MVT::f64)
      return SDValue();

    // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
    SDLoc DL(ExtElt);
    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
                               Vec.getOperand(0), Index);
    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
                               Vec.getOperand(1), Index);
    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
  }

  // Everything below is restricted to f32/f64, plus f16 when the subtarget
  // has FP16.
  if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
      VT != MVT::f64)
    return SDValue();

  // Vector FP selects don't fit the pattern of FP math ops (because the
  // condition has a different type and we have to change the opcode), so deal
  // with those here.
  // FIXME: This is restricted to pre type legalization by ensuring the setcc
  // has i1 elements. If we loosen this we need to convert vector bool to a
  // scalar bool.
  if (Vec.getOpcode() == ISD::VSELECT &&
      Vec.getOperand(0).getOpcode() == ISD::SETCC &&
      Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
      Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
    // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
    SDLoc DL(ExtElt);
    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                               Vec.getOperand(0).getValueType().getScalarType(),
                               Vec.getOperand(0), Index);
    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                               Vec.getOperand(1), Index);
    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                               Vec.getOperand(2), Index);
    return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
  }

  // TODO: This switch could include FNEG and the x86-specific FP logic ops
  // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
  // missed load folding and fma+fneg combining.
  switch (Vec.getOpcode()) {
  case ISD::FMA: // Begin 3 operands
  case ISD::FMAD:
  case ISD::FADD: // Begin 2 operands
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FCOPYSIGN:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case X86ISD::FMAX:
  case X86ISD::FMIN:
  case ISD::FABS: // Begin 1 operand
  case ISD::FSQRT:
  case ISD::FRINT:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case X86ISD::FRCP:
  case X86ISD::FRSQRT: {
    // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
    SDLoc DL(ExtElt);
    SmallVector<SDValue, 4> ExtOps;
    for (SDValue Op : Vec->ops())
      ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
    return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
  }
  default:
    return SDValue();
  }
  llvm_unreachable("All opcodes should return within switch");
}

/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
/// Handles ADD, MUL and FADD reductions that end in an extract of element 0;
/// returns an empty SDValue when SSE2 is missing or no lowering applies.
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");

  // We need at least SSE2 to do anything here.
  if (!Subtarget.hasSSE2())
    return SDValue();

  ISD::NodeType Opc;
  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
                                        {ISD::ADD, ISD::MUL, ISD::FADD}, true);
  if (!Rdx)
    return SDValue();

  SDValue Index = ExtElt->getOperand(1);
  assert(isNullConstant(Index) &&
         "Reduction doesn't end in an extract from index 0");

  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = Rdx.getValueType();
  if (VecVT.getScalarType() != VT)
    return SDValue();

  SDLoc DL(ExtElt);
  unsigned NumElts = VecVT.getVectorNumElements();
  unsigned EltSizeInBits = VecVT.getScalarSizeInBits();

  // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
  auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
    if (V.getValueType() == MVT::v4i8) {
      if (ZeroExtend && Subtarget.hasSSE41()) {
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
                        DAG.getConstant(0, DL, MVT::v4i32),
                        DAG.getBitcast(MVT::i32, V),
                        DAG.getIntPtrConstant(0, DL));
        return DAG.getBitcast(MVT::v16i8, V);
      }
      V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
                      ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
                                 : DAG.getUNDEF(MVT::v4i8));
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
                       DAG.getUNDEF(MVT::v8i8));
  };

  // vXi8 mul reduction - promote to vXi16 mul reduction.
  if (Opc == ISD::MUL) {
    if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
      return SDValue();
    if (VecVT.getSizeInBits() >= 128) {
      EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
      SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
      SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
      Lo = DAG.getBitcast(WideVT, Lo);
      Hi = DAG.getBitcast(WideVT, Hi);
      Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
      while (Rdx.getValueSizeInBits() > 128) {
        std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
        Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
      }
    } else {
      Rdx = WidenToV16I8(Rdx, false);
      Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
      Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
    }
    if (NumElts >= 8)
      Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
                        DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
                                             {4, 5, 6, 7, -1, -1, -1, -1}));
    Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
                      DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
                                           {2, 3, -1, -1, -1, -1, -1, -1}));
    Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
                      DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
                                           {1, -1, -1, -1, -1, -1, -1, -1}));
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // vXi8 add reduction - sub 128-bit vector.
  if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
    Rdx = WidenToV16I8(Rdx, true);
    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
                      DAG.getConstant(0, DL, MVT::v16i8));
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // Must be a >=128-bit vector with pow2 elements.
  if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
    return SDValue();

  // vXi8 add reduction - sum lo/hi halves then use PSADBW.
  if (VT == MVT::i8) {
    while (Rdx.getValueSizeInBits() > 128) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
      VecVT = Lo.getValueType();
      Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
    }
    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");

    SDValue Hi = DAG.getVectorShuffle(
        MVT::v16i8, DL, Rdx, Rdx,
        {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
    Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
                      getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // See if we can use vXi8 PSADBW add reduction for larger zext types.
  // If the source vector values are 0-255, then we can use PSADBW to
  // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
  // TODO: See if its worth avoiding vXi16/i32 truncations?
  if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
      DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
      (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
       Subtarget.hasAVX512())) {
    if (Rdx.getValueType() == MVT::v8i16) {
      Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
                        DAG.getUNDEF(MVT::v8i16));
    } else {
      EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
      Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
      if (ByteVT.getSizeInBits() < 128)
        Rdx = WidenToV16I8(Rdx, true);
    }

    // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
    auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
      MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
      SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
      return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
    };
    MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
    Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);

    // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
    while (Rdx.getValueSizeInBits() > 128) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
      VecVT = Lo.getValueType();
      Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
    }
    assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");

    if (NumElts > 8) {
      SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
      Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
    }

    VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
    Rdx = DAG.getBitcast(VecVT, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
  if (!shouldUseHorizontalOp(true, DAG, Subtarget))
    return SDValue();

  unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;

  // 256-bit horizontal instructions operate on 128-bit chunks rather than
  // across the whole vector, so we need an extract + hop preliminary stage.
  // This is the only step where the operands of the hop are not the same value.
  // TODO: We could extend this to handle 512-bit or even longer vectors.
  if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
      ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
    // VecVT has not been modified on this path, so NumElts (computed from it
    // above) is still its element count.
    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
    Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
    VecVT = Rdx.getValueType();
  }
  if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
      !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
    return SDValue();

  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
  unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
  for (unsigned i = 0; i != ReductionSteps; ++i)
    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}

/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
///
/// This is the combine entry point for element extracts: it tries, in order,
/// the shuffle peek-through, constant folding, the reduction patterns (SAD,
/// VPDPBUSD, predicate, min/max, arithmetic), FP scalarization, extract from
/// a vector load, MOVMSK-based i1 extraction and extract(trunc) folding.
/// Nodes whose opcode is not ISD::EXTRACT_VECTOR_ELT are treated as PEXTR-style
/// extracts (see IsPextr) and only get the early folds.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
    return NewOp;

  SDValue InputVector = N->getOperand(0);
  SDValue EltIdx = N->getOperand(1);
  auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);

  EVT SrcVT = InputVector.getValueType();
  EVT VT = N->getValueType(0);
  SDLoc dl(InputVector);
  bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
  unsigned NumSrcElts = SrcVT.getVectorNumElements();
  unsigned NumEltBits = VT.getScalarSizeInBits();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Out-of-range constant index: PEXTR-style nodes fold to 0, generic
  // extracts fold to undef.
  if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
    return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);

  // Integer Constant Folding.
  if (CIdx && VT.isInteger()) {
    APInt UndefVecElts;
    SmallVector<APInt, 16> EltBits;
    unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
    if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
                                      EltBits, /*AllowWholeUndefs*/ true,
                                      /*AllowPartialUndefs*/ false)) {
      uint64_t Idx = CIdx->getZExtValue();
      if (UndefVecElts[Idx])
        return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
      return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
    }

    // Convert extract_element(bitcast(<vXi1>)) -> bitcast(extract_subvector()).
    // Improves lowering of bool masks on rust which splits them into byte array.
    if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
      SDValue Src = peekThroughBitcasts(InputVector);
      if (Src.getValueType().getScalarType() == MVT::i1 &&
          TLI.isTypeLegal(Src.getValueType())) {
        MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
        SDValue Sub = DAG.getNode(
            ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
            DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
        return DAG.getBitcast(VT, Sub);
      }
    }
  }

  if (IsPextr) {
    if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
                                 DCI))
      return SDValue(N, 0);

    // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
    if ((InputVector.getOpcode() == X86ISD::PINSRB ||
         InputVector.getOpcode() == X86ISD::PINSRW) &&
        InputVector.getOperand(2) == EltIdx) {
      assert(SrcVT == InputVector.getOperand(0).getValueType() &&
             "Vector type mismatch");
      SDValue Scl = InputVector.getOperand(1);
      Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
      return DAG.getZExtOrTrunc(Scl, dl, VT);
    }

    // TODO - Remove this once we can handle the implicit zero-extension of
    // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
    // combineBasicSADPattern.
    return SDValue();
  }

  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
  if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
      InputVector.getOpcode() == ISD::BITCAST &&
      InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
      isNullConstant(EltIdx) && InputVector.hasOneUse())
    return DAG.getBitcast(VT, InputVector);

  // Detect mmx to i32 conversion through a v2i32 elt extract.
  if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
      InputVector.getOpcode() == ISD::BITCAST &&
      InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
      isNullConstant(EltIdx) && InputVector.hasOneUse())
    return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
                       InputVector.getOperand(0));

  // Check whether this extract is the root of a sum of absolute differences
  // pattern. This has to be done here because we really want it to happen
  // pre-legalization.
  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
    return SAD;

  if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
    return VPDPBUSD;

  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
  if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
    return Cmp;

  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
  if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
    return MinMax;

  // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
  if (SDValue V = combineArithReduction(N, DAG, Subtarget))
    return V;

  if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
    return V;

  if (CIdx)
    if (SDValue V = combineExtractFromVectorLoad(
            N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
            dl, DAG, DCI))
      return V;

  // Attempt to extract a i1 element by using MOVMSK to extract the signbits
  // and then testing the relevant element.
  //
  // Note that we only combine extracts on the *same* result number, i.e.
  //   t0 = merge_values a0, a1, a2, a3
  //   i1 = extract_vector_elt t0, Constant:i64<2>
  //   i1 = extract_vector_elt t0, Constant:i64<3>
  // but not
  //   i1 = extract_vector_elt t0:1, Constant:i64<2>
  // since the latter would need its own MOVMSK.
  if (SrcVT.getScalarType() == MVT::i1) {
    bool IsVar = !CIdx;
    SmallVector<SDNode *, 16> BoolExtracts;
    unsigned ResNo = InputVector.getResNo();
    // Predicate with side effects: collects every matching i1 extract and
    // records whether any of them uses a non-constant index.
    auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
      if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          Use->getOperand(0).getResNo() == ResNo &&
          Use->getValueType(0) == MVT::i1) {
        BoolExtracts.push_back(Use);
        IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
        return true;
      }
      return false;
    };
    // TODO: Can we drop the oneuse check for constant extracts?
    if (all_of(InputVector->uses(), IsBoolExtract) &&
        (IsVar || BoolExtracts.size() > 1)) {
      EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
      if (SDValue BC =
              combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
        for (SDNode *Use : BoolExtracts) {
          // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
          // Mask = 1 << MaskIdx
          SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
          SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
          SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
          SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
          Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
          DCI.CombineTo(Use, Res);
        }
        return SDValue(N, 0);
      }
    }
  }

  // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
  if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
    SDValue TruncSrc = InputVector.getOperand(0);
    EVT TruncSVT = TruncSrc.getValueType().getScalarType();
    if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
      SDValue NewExt =
          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
      return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
    }
  }

  return SDValue();
}

// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
// Only applies before op legalization, on SSE2+ targets without AVX512, for
// sign/zero/any extends to vectors of i8/i16/i32/i64; otherwise returns an
// empty SDValue.
static SDValue combineToExtendBoolVectorInReg(
    unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
    TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
      Opcode != ISD::ANY_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  EVT SVT = VT.getScalarType();
  EVT InSVT = N0.getValueType().getScalarType();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  // Input type must be extending a bool vector (bit-casted from a scalar
  // integer) to legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
    return SDValue();
  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  EVT SclVT = N00.getValueType();
  if (!SclVT.isScalarInteger())
    return SDValue();

  SDValue Vec;
  SmallVector<int, 32> ShuffleMask;
  unsigned NumElts = VT.getVectorNumElements();
  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

  // Broadcast the scalar integer to the vector elements.
  if (NumElts > EltSizeInBits) {
    // If the scalar integer is greater than the vector element size, then we
    // must split it down into sub-sections for broadcasting. For example:
    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
    unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    Vec = DAG.getBitcast(VT, Vec);

    for (unsigned i = 0; i != Scale; ++i)
      ShuffleMask.append(EltSizeInBits, i);
    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
  } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
             (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
    // If we have register broadcast instructions, use the scalar size as the
    // element type for the shuffle. Then cast to the wider element type. The
    // widened bits won't be used, and this might allow the use of a broadcast
    // load.
    assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
    unsigned Scale = EltSizeInBits / NumElts;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    ShuffleMask.append(NumElts * Scale, 0);
    Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
    Vec = DAG.getBitcast(VT, Vec);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to all
    // elements.
    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
  }

  // Now, mask the relevant bit in each element.
  SmallVector<SDValue, 32> Bits;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BitIdx = (i % EltSizeInBits);
    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
  }
  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

  // Compare against the bitmask and extend the result.
  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

  // For SEXT, this is now done, otherwise shift the result down for
  // zero-extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
  return DAG.getNode(ISD::SRL, DL, VT, Vec,
                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
}

/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
///
/// \returns the simplified value, or an empty SDValue if \p N is not a
/// VSELECT, the condition element size differs from the value element size,
/// the condition is not known to be a sign splat, or no arm is all-ones /
/// all-zeros.
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  assert(CondVT.isVector() && "Vector select expects a vector selector!");

  // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
  // TODO: Can we assert that both operands are not zeros (because that should
  //       get simplified at node creation time)?
  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

  // If both inputs are 0/undef, create a complete zero vector.
  // FIXME: As noted above this should be handled by DAGCombiner/getNode.
  if (TValIsAllZeros && FValIsAllZeros) {
    if (VT.isFloatingPoint())
      return DAG.getConstantFP(0.0, DL, VT);
    return DAG.getConstant(0, DL, VT);
  }

  // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements. Ie, the condition operand must
  // have already been promoted from the IR select condition type <N x i1>.
  // Don't check if the types themselves are equal because that excludes
  // vector floating-point selects.
  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  // Try to invert the condition if true value is not all 1s and false value is
  // not all 0s. Only do this if the condition has one use.
  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
  if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
      // Check if the selector will be produced by CMPP*/PCMP*.
      Cond.getOpcode() == ISD::SETCC &&
      // Check if SETCC has already been promoted.
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
          CondVT) {
    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

    if (TValIsAllZeros || FValIsAllOnes) {
      SDValue CC = Cond.getOperand(2);
      ISD::CondCode NewCC = ISD::getSetCCInverse(
          cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
                          NewCC);
      // The condition was inverted, so swap the arms and their cached
      // all-ones / all-zeros classifications to match.
      std::swap(LHS, RHS);
      TValIsAllOnes = FValIsAllOnes;
      FValIsAllZeros = TValIsAllZeros;
    }
  }

  // Cond value must be 'sign splat' to be converted to a logical op.
  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
    return SDValue();

  // vselect Cond, 111..., 000... -> Cond
  if (TValIsAllOnes && FValIsAllZeros)
    return DAG.getBitcast(VT, Cond);

  if (!TLI.isTypeLegal(CondVT))
    return SDValue();

  // vselect Cond, 111..., X -> or Cond, X
  if (TValIsAllOnes) {
    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
    return DAG.getBitcast(VT, Or);
  }

  // vselect Cond, X, 000... -> and Cond, X
  if (FValIsAllZeros) {
    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
    return DAG.getBitcast(VT, And);
  }

  // vselect Cond, 000..., X -> andn Cond, X
  if (TValIsAllZeros) {
    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
    SDValue AndN;
    // The canonical form differs for i1 vectors - x86andnp is not used
    if (CondVT.getScalarType() == MVT::i1)
      AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
                         CastRHS);
    else
      AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
    return DAG.getBitcast(VT, AndN);
  }

  return SDValue();
}

/// If both arms of a vector select are concatenated vectors, split the select,
/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
                                  const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
    return SDValue();

  // TODO: Split 512-bit vectors too?
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();

  // TODO: Split as long as any 2 of the 3 operands are concatenated?
SDValue Cond = N->getOperand(0); SDValue TVal = N->getOperand(1); SDValue FVal = N->getOperand(2); if (!TVal.hasOneUse() || !FVal.hasOneUse() || !isFreeToSplitVector(TVal.getNode(), DAG) || !isFreeToSplitVector(FVal.getNode(), DAG)) return SDValue(); auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend, /*CheckBWI*/ false); } static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); auto *TrueC = dyn_cast(LHS); auto *FalseC = dyn_cast(RHS); if (!TrueC || !FalseC) return SDValue(); // Don't do this for crazy integer types. EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // We're going to use the condition bit in math or logic ops. We could allow // this with a wider condition value (post-legalization it becomes an i8), // but if nothing is creating selects that late, it doesn't matter. if (Cond.getValueType() != MVT::i1) return SDValue(); // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by // 3, 5, or 9 with i32/i64, so those get transformed too. // TODO: For constants that overflow or do not differ by power-of-2 or small // multiplier, convert to 'and' + 'add'. const APInt &TrueVal = TrueC->getAPIntValue(); const APInt &FalseVal = FalseC->getAPIntValue(); // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB. 
if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) && Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); if (CC == ISD::SETEQ || CC == ISD::SETNE) return SDValue(); } bool OV; APInt Diff = TrueVal.ssub_ov(FalseVal, OV); if (OV) return SDValue(); APInt AbsDiff = Diff.abs(); if (AbsDiff.isPowerOf2() || ((VT == MVT::i32 || VT == MVT::i64) && (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { // We need a positive multiplier constant for shift/LEA codegen. The 'not' // of the condition can usually be folded into a compare predicate, but even // without that, the sequence should be cheaper than a CMOV alternative. if (TrueVal.slt(FalseVal)) { Cond = DAG.getNOT(DL, Cond, MVT::i1); std::swap(TrueC, FalseC); } // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); // Multiply condition by the difference if non-one. if (!AbsDiff.isOne()) R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT)); // Add the base if non-zero. if (!FalseC->isZero()) R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0)); return R; } return SDValue(); } /// If this is a *dynamic* select (non-constant condition) and we can match /// this node with one of the variable blend instructions, restructure the /// condition so that blends can use the high (sign) bit of each element. /// This function will also call SimplifyDemandedBits on already created /// BLENDV to perform additional simplifications. 
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
                                      const SDLoc &DL,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  // Only VSELECT/BLENDV nodes with a non-constant condition are handled here.
  SDValue Cond = N->getOperand(0);
  if ((N->getOpcode() != ISD::VSELECT &&
       N->getOpcode() != X86ISD::BLENDV) ||
      ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned BitWidth = Cond.getScalarValueSizeInBits();
  EVT VT = N->getValueType(0);

  // We can only handle the cases where VSELECT is directly legal on the
  // subtarget. We custom lower VSELECT nodes with constant conditions and
  // this makes it hard to see whether a dynamic VSELECT will correctly
  // lower, so we both check the operation's status and explicitly handle the
  // cases where a *dynamic* blend will fail even though a constant-condition
  // blend could be custom lowered.
  // FIXME: We should find a better way to handle this class of problems.
  // Potentially, we should combine constant-condition vselect nodes
  // pre-legalization into shuffles and not mark as many types as custom
  // lowered.
  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
    return SDValue();
  // FIXME: We don't support i16-element blends currently. We could and
  // should support them by making *all* the bits in the condition be set
  // rather than just the high bit and using an i8-element blend.
  if (VT.getVectorElementType() == MVT::i16)
    return SDValue();
  // Dynamic blending was only available from SSE4.1 onward.
  if (VT.is128BitVector() && !Subtarget.hasSSE41())
    return SDValue();
  // Byte blends are only available in AVX2
  if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
    return SDValue();
  // There are no 512-bit blend instructions that use sign bits.
  if (VT.is512BitVector())
    return SDValue();

  // Don't optimize before the condition has been transformed to a legal type
  // and don't ever optimize vector selects that map to AVX512 mask-registers.
  if (BitWidth < 8 || BitWidth > 64)
    return SDValue();

  // True if every use of Cond is as operand 0 (the condition) of a VSELECT or
  // BLENDV node.
  auto OnlyUsedAsSelectCond = [](SDValue Cond) {
    for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
         UI != UE; ++UI)
      if ((UI->getOpcode() != ISD::VSELECT &&
           UI->getOpcode() != X86ISD::BLENDV) ||
          UI.getOperandNo() != 0)
        return false;

    return true;
  };

  // Only the sign bit of each condition element is demanded.
  APInt DemandedBits(APInt::getSignMask(BitWidth));

  if (OnlyUsedAsSelectCond(Cond)) {
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
      return SDValue();

    // If we changed the computation somewhere in the DAG, this change will
    // affect all users of Cond. Update all the nodes so that we do not use
    // the generic VSELECT anymore. Otherwise, we may perform wrong
    // optimizations as we messed with the actual expectation for the vector
    // boolean values.
    for (SDNode *U : Cond->uses()) {
      if (U->getOpcode() == X86ISD::BLENDV)
        continue;

      SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
                               Cond, U->getOperand(1), U->getOperand(2));
      DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
      DCI.AddToWorklist(U);
    }
    DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  // Otherwise we can still at least try to simplify multiple use bits.
  if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
    return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
                       N->getOperand(1), N->getOperand(2));

  return SDValue();
}

// Try to match:
//   (or (and (M, (sub 0, X)), (pandn M, X)))
// which is a special case of:
//   (select M, (sub 0, X), X)
// Per:
// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
// We know that, if fNegate is 0 or 1:
//   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
//
// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
//   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
//   (  M      ? -X : X) == ((X ^   M     ) + (M & 1))
// This lets us transform our vselect to:
//   (add (xor X, M), (and M, 1))
// And further to:
//   (sub (xor X, M), M)
//
// The result is computed in the mask type and bitcast to VT. An empty SDValue
// is returned when X and Y do not both have the mask type, when SUB is not
// legal for that type, or when neither of X/Y is (sub 0, other).
static SDValue combineLogicBlendIntoConditionalNegate(
    EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  EVT MaskVT = Mask.getValueType();
  assert(MaskVT.isInteger() &&
         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
         "Mask must be zero/all-bits");

  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
    return SDValue();
  if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
    return SDValue();

  // Matches N == (sub 0, V), where the zero is an all-zeros build vector.
  auto IsNegV = [](SDNode *N, SDValue V) {
    return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
           ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
  };

  SDValue V;
  if (IsNegV(Y.getNode(), X))
    V = X;
  else if (IsNegV(X.getNode(), Y))
    V = Y;
  else
    return SDValue();

  SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
  SDValue SubOp2 = Mask;

  // If the negate was on the false side of the select, then
  // the operands of the SUB need to be swapped. PR 27251.
  // This is because the pattern being matched above is
  // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
  // but if the pattern matched was
  // (vselect M, X, (sub (0, X))), that is really negation of the pattern
  // above, -(vselect M, (sub 0, X), X), and therefore the replacement
  // pattern also needs to be a negation of the replacement pattern above.
  // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
  // sub accomplishes the negation of the replacement pattern.
  if (V == Y)
    std::swap(SubOp1, SubOp2);

  SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
  return DAG.getBitcast(VT, Res);
}

/// On AVX512 targets, swap the arms of a VSELECT and invert its SETCC
/// condition when the false operand, but not the true operand, satisfies
/// canCombineAsMaskOperation:
///   (vselect M, L, R) -> (vselect ~M, R, L)
/// The condition must be a SETCC with a single use.
/// \returns the commuted select, or an empty SDValue if the rewrite does not
/// apply.
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
                             const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX512())
    return SDValue();
  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  if (canCombineAsMaskOperation(LHS, Subtarget))
    return SDValue();

  if (!canCombineAsMaskOperation(RHS, Subtarget))
    return SDValue();

  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
    return SDValue();

  // Commute LHS and RHS to create opportunity to select mask instruction.
  // (vselect M, L, R) -> (vselect ~M, R, L)
  ISD::CondCode NewCC =
      ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
                           Cond.getOperand(0).getValueType());
  Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
                      Cond.getOperand(1), NewCC);
  return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
}

/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // Try simplification again because we use this function to optimize
  // BLENDV nodes that are not handled by the generic combiner.
  if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
    return V;

  // When avx512 is available the lhs operand of select instruction can be
  // folded with mask instruction, while the rhs operand can't. Commute the
  // lhs and rhs of the select instruction to create the opportunity of
  // folding.
if (SDValue V = commuteSelect(N, DAG, DL, Subtarget)) return V; EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()); // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M). // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT // can't catch, plus vXi8 cases where we'd likely end up with BLENDV. if (CondVT.isVector() && CondVT.isInteger() && CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() && (!CondConstantVector || CondVT.getScalarType() == MVT::i8) && DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits()) if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS, DL, DAG, Subtarget)) return V; // Convert vselects with constant condition into shuffles. if (CondConstantVector && DCI.isBeforeLegalizeOps() && (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) { SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond, N->getOpcode() == X86ISD::BLENDV)) return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); } // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y)) // by forcing the unselected elements to zero. // TODO: Can we handle more shuffles with this? if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() && LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB && LHS.hasOneUse() && RHS.hasOneUse()) { MVT SimpleVT = VT.getSimpleVT(); SmallVector LHSOps, RHSOps; SmallVector LHSMask, RHSMask, CondMask; if (createShuffleMaskFromVSELECT(CondMask, Cond) && getTargetShuffleMask(LHS, true, LHSOps, LHSMask) && getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) { int NumElts = VT.getVectorNumElements(); for (int i = 0; i != NumElts; ++i) { // getConstVector sets negative shuffle mask values as undef, so ensure // we hardcode SM_SentinelZero values to zero (0x80). 
if (CondMask[i] < NumElts) { LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i]; RHSMask[i] = 0x80; } else { LHSMask[i] = 0x80; RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i]; } } LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0), getConstVector(LHSMask, SimpleVT, DAG, DL, true)); RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0), getConstVector(RHSMask, SimpleVT, DAG, DL, true)); return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); } } // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom x(Cond.getOperand(2))->get(); unsigned Opcode = 0; // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); [[fallthrough]]; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMIN; break; case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. 
if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); [[fallthrough]]; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMAX; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. 
std::swap(LHS, RHS); [[fallthrough]]; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMIN; break; case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); [[fallthrough]]; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMAX; break; } } if (Opcode) return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } // Some mask scalar intrinsics rely on checking if only one bit is set // and implement it in C code like this: // A[0] = (U & 1) ? A[0] : W[0]; // This creates some redundant instructions that break pattern matching. // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y) if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); SDValue AndNode = Cond.getOperand(0); if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && isOneConstant(AndNode.getOperand(1))) { // LHS and RHS swapped due to // setcc outputting 1 when AND resulted in 0 and vice versa. 
AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8); return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS); } } // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. // The same situation all vectors of i8 and i16 without BWI. // Make sure we extend these even before type legalization gets a chance to // split wide vectors. // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } // AVX512 - Extend select with zero to merge with target shuffle. // select(mask, extract_subvector(shuffle(x)), zero) --> // extract_subvector(select(insert_subvector(mask), shuffle(x), zero)) // TODO - support non target shuffles as well. if (Subtarget.hasAVX512() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { auto SelectableOp = [&TLI](SDValue Op) { return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isTargetShuffle(Op.getOperand(0).getOpcode()) && isNullConstant(Op.getOperand(1)) && TLI.isTypeLegal(Op.getOperand(0).getValueType()) && Op.hasOneUse() && Op.getOperand(0).hasOneUse(); }; bool SelectableLHS = SelectableOp(LHS); bool SelectableRHS = SelectableOp(RHS); bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode()); bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode()); if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) { EVT SrcVT = SelectableLHS ? 
LHS.getOperand(0).getValueType() : RHS.getOperand(0).getValueType(); EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1); LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL, VT.getSizeInBits()); RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL, VT.getSizeInBits()); Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT, DAG.getUNDEF(SrcCondVT), Cond, DAG.getIntPtrConstant(0, DL)); SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS); return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); } } if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL)) return V; if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { EVT CondVT = Cond.getValueType(); SDValue Cond0 = Cond.getOperand(0); SDValue Cond1 = Cond.getOperand(1); ISD::CondCode CC = cast(Cond.getOperand(2))->get(); // Canonicalize min/max: // (x > 0) ? x : 0 -> (x >= 0) ? x : 0 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates // the need for an extra compare against zero. e.g. // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax // cmovgl %edi, %eax // => // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi // // We can also canonicalize // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1 // This allows the use of a test instruction for the compare. if (LHS == Cond0 && RHS == Cond1) { if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) || (CC == ISD::SETLT && isAllOnesConstant(RHS))) { ISD::CondCode NewCC = CC == ISD::SETGT ? 
ISD::SETGE : ISD::SETLE; Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } if (CC == ISD::SETUGT && isOneConstant(RHS)) { ISD::CondCode NewCC = ISD::SETUGE; Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types. // fold eq + gt/lt nested selects into ge/le selects // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y) // --> (select (cmpuge Cond0, Cond1), LHS, Y) // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y) // --> (select (cmpsle Cond0, Cond1), LHS, Y) // .. etc .. if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS && RHS.getOperand(0).getOpcode() == ISD::SETCC) { SDValue InnerSetCC = RHS.getOperand(0); ISD::CondCode InnerCC = cast(InnerSetCC.getOperand(2))->get(); if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) && Cond0 == InnerSetCC.getOperand(0) && Cond1 == InnerSetCC.getOperand(1)) { ISD::CondCode NewCC; switch (CC == ISD::SETEQ ? InnerCC : CC) { // clang-format off case ISD::SETGT: NewCC = ISD::SETGE; break; case ISD::SETLT: NewCC = ISD::SETLE; break; case ISD::SETUGT: NewCC = ISD::SETUGE; break; case ISD::SETULT: NewCC = ISD::SETULE; break; default: NewCC = ISD::SETCC_INVALID; break; // clang-format on } if (NewCC != ISD::SETCC_INVALID) { Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2)); } } } } // Check if the first operand is all zeros and Cond type is vXi1. // If this an avx512 target we can improve the use of zero masking by // swapping the operands and inverting the condition. 
if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() && Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorAllZeros(LHS.getNode()) && !ISD::isBuildVectorAllZeros(RHS.getNode())) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might // get split by legalization. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST && CondVT.getVectorElementType() == MVT::i1 && TLI.isTypeLegal(VT.getScalarType())) { EVT ExtCondVT = VT.changeVectorElementTypeToInteger(); if (SDValue ExtCond = combineToExtendBoolVectorInReg( ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) { ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond); return DAG.getSelect(DL, VT, ExtCond, LHS, RHS); } } // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts // with out-of-bounds clamping. // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle // shift amounts exceeding the element bitwidth. VSHLV/VSRLV clamps the amount // to bitwidth-1 for unsigned shifts, effectively performing a maximum left // shift of bitwidth-1 positions. and returns zero for unsigned right shifts // exceeding bitwidth-1. if (N->getOpcode() == ISD::VSELECT) { using namespace llvm::SDPatternMatch; // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt) // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt) if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) && supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) && ISD::isConstantSplatVectorAllZeros(RHS.getNode()) && sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)), m_SpecificInt(VT.getScalarSizeInBits()), m_SpecificCondCode(ISD::SETULT)))) { return DAG.getNode(LHS.getOpcode() == ISD::SRL ? 
X86ISD::VSRLV : X86ISD::VSHLV, DL, VT, LHS.getOperand(0), LHS.getOperand(1)); } // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt) // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt) if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) && supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) && ISD::isConstantSplatVectorAllZeros(LHS.getNode()) && sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)), m_SpecificInt(VT.getScalarSizeInBits()), m_SpecificCondCode(ISD::SETUGE)))) { return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV : X86ISD::VSHLV, DL, VT, RHS.getOperand(0), RHS.getOperand(1)); } } // Early exit check if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DL, DCI, Subtarget)) return V; if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget)) return V; if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget)) return V; // select(~Cond, X, Y) -> select(Cond, Y, X) if (CondVT.getScalarType() != MVT::i1) { if (SDValue CondNot = IsNOT(Cond, DAG)) return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(CondVT, CondNot), RHS, LHS); // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the // signbit. if (Cond.getOpcode() == X86ISD::PCMPGT && ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) && Cond.hasOneUse()) { Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT, DAG.getConstant(0, DL, CondVT), Cond.getOperand(0)); return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS); } } // Try to optimize vXi1 selects if both operands are either all constants or // bitcasts from scalar integer type. In that case we can convert the operands // to integer and use an integer select which will be converted to a CMOV. // We need to take a little bit of care to avoid creating an i64 type after // type legalization. 
if (N->getOpcode() == ISD::SELECT && VT.isVector() && VT.getVectorElementType() == MVT::i1 && (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) { EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) { bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()); bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()); if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST && LHS.getOperand(0).getValueType() == IntVT)) && (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST && RHS.getOperand(0).getValueType() == IntVT))) { if (LHSIsConst) LHS = combinevXi1ConstantToInteger(LHS, DAG); else LHS = LHS.getOperand(0); if (RHSIsConst) RHS = combinevXi1ConstantToInteger(RHS, DAG); else RHS = RHS.getOperand(0); SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS); return DAG.getBitcast(VT, Select); } } } // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of // single bits, then invert the predicate and swap the select operands. // This can lower using a vector shift bit-hack rather than mask and compare. if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() && N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 && Cond.getOperand(0).getOpcode() == ISD::AND && isNullOrNullSplat(Cond.getOperand(1)) && cast(Cond.getOperand(2))->get() == ISD::SETEQ && Cond.getOperand(0).getValueType() == VT) { // The 'and' mask must be composed of power-of-2 constants. 
SDValue And = Cond.getOperand(0); auto *C = isConstOrConstSplat(And.getOperand(1)); if (C && C->getAPIntValue().isPowerOf2()) { // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS SDValue NotCond = DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE); return DAG.getSelect(DL, VT, NotCond, RHS, LHS); } // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply. // 16-bit lacks a proper blendv. unsigned EltBitWidth = VT.getScalarSizeInBits(); bool CanShiftBlend = TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) || (Subtarget.hasAVX2() && EltBitWidth == 64) || (Subtarget.hasXOP())); if (CanShiftBlend && ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) { return C->getAPIntValue().isPowerOf2(); })) { // Create a left-shift constant to get the mask bits over to the sign-bit. SDValue Mask = And.getOperand(1); SmallVector ShlVals; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { auto *MaskVal = cast(Mask.getOperand(i)); ShlVals.push_back(EltBitWidth - 1 - MaskVal->getAPIntValue().exactLogBase2()); } // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt); SDValue NewCond = DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT); return DAG.getSelect(DL, VT, NewCond, RHS, LHS); } } return SDValue(); } /// Combine: /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) /// to: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) /// i.e., reusing the EFLAGS produced by the LOCKed instruction. /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This combine only operates on CMP-like nodes. 
if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Can't replace the cmp if it has more uses than the one we're looking at. // FIXME: We would like to be able to handle this, but would need to make sure // all uses were updated. if (!Cmp.hasOneUse()) return SDValue(); // This only applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) // Using the proper condcodes (see below), overflow is checked for. // FIXME: We can generalize both constraints: // - XOR/OR/AND (if they were made to survive AtomicExpand) // - LHS != 1 // if the result is compared. SDValue CmpLHS = Cmp.getOperand(0); SDValue CmpRHS = Cmp.getOperand(1); EVT CmpVT = CmpLHS.getValueType(); if (!CmpLHS.hasOneUse()) return SDValue(); unsigned Opc = CmpLHS.getOpcode(); if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) return SDValue(); SDValue OpRHS = CmpLHS.getOperand(2); auto *OpRHSC = dyn_cast(OpRHS); if (!OpRHSC) return SDValue(); APInt Addend = OpRHSC->getAPIntValue(); if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; auto *CmpRHSC = dyn_cast(CmpRHS); if (!CmpRHSC) return SDValue(); APInt Comparison = CmpRHSC->getAPIntValue(); APInt NegAddend = -Addend; // See if we can adjust the CC to make the comparison match the negated // addend. 
if (Comparison != NegAddend) { APInt IncComparison = Comparison + 1; if (IncComparison == NegAddend) { if (CC == X86::COND_A && !Comparison.isMaxValue()) { Comparison = IncComparison; CC = X86::COND_AE; } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) { Comparison = IncComparison; CC = X86::COND_L; } } APInt DecComparison = Comparison - 1; if (DecComparison == NegAddend) { if (CC == X86::COND_AE && !Comparison.isMinValue()) { Comparison = DecComparison; CC = X86::COND_A; } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) { Comparison = DecComparison; CC = X86::COND_LE; } } } // If the addend is the negation of the comparison value, then we can do // a full comparison by emitting the atomic arithmetic as a locked sub. if (Comparison == NegAddend) { // The CC is fine, but we need to rewrite the LHS of the comparison as an // atomic sub. auto *AN = cast(CmpLHS.getNode()); auto AtomicSub = DAG.getAtomic( ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT, /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1), /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT), AN->getMemOperand()); auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT)); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // We can handle comparisons with zero in a number of cases by manipulating // the CC used. 
if (!Comparison.isZero()) return SDValue(); if (CC == X86::COND_S && Addend == 1) CC = X86::COND_LE; else if (CC == X86::COND_NS && Addend == 1) CC = X86::COND_G; else if (CC == X86::COND_G && Addend == -1) CC = X86::COND_GE; else if (CC == X86::COND_LE && Addend == -1) CC = X86::COND_L; else return SDValue(); SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT)); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // Check whether we're just testing the signbit, and whether we can simplify // this by tracking where the signbit came from. static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG) { if (CC != X86::COND_S && CC != X86::COND_NS) return SDValue(); if (!Cmp.hasOneUse()) return SDValue(); SDValue Src; if (Cmp.getOpcode() == X86ISD::CMP) { // CMP(X,0) -> signbit test if (!isNullConstant(Cmp.getOperand(1))) return SDValue(); Src = Cmp.getOperand(0); // Peek through a SRA node as we just need the signbit. // TODO: Remove one use limit once sdiv-fix regressions are fixed. // TODO: Use SimplifyDemandedBits instead of just SRA? if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse()) return SDValue(); Src = Src.getOperand(0); } else if (Cmp.getOpcode() == X86ISD::OR) { // OR(X,Y) -> see if only one operand contributes to the signbit. // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit. if (DAG.SignBitIsZero(Cmp.getOperand(0))) Src = Cmp.getOperand(1); else if (DAG.SignBitIsZero(Cmp.getOperand(1))) Src = Cmp.getOperand(0); else return SDValue(); } else { return SDValue(); } // Replace with a TEST on the MSB. SDLoc DL(Cmp); MVT SrcVT = Src.getSimpleValueType(); APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits()); // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then // peek through and adjust the TEST bit. 
if (Src.getOpcode() == ISD::SHL) { if (std::optional ShiftAmt = DAG.getValidShiftAmount(Src)) { Src = Src.getOperand(0); BitMask.lshrInPlace(*ShiftAmt); } } SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src, DAG.getConstant(BitMask, DL, SrcVT)); CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E; return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask, DAG.getConstant(0, DL, SrcVT)); } // Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. // // Simplify the following patterns: // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) // to (Op EFLAGS Cond) // // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) // to (Op EFLAGS !Cond) // // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); // Check CMP operands. One of them should be 0 or 1 and the other should be // an SetCC or extended from it. SDValue Op1 = Cmp.getOperand(0); SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast(Op1))) SetCC = Op2; else if ((C = dyn_cast(Op2))) SetCC = Op1; else // Quit if all operands are not constants. return SDValue(); if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; checkAgainstTrue = true; } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. return SDValue(); bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. 
while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx < 0) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; } else SetCC = SetCC.getOperand(0); } switch (SetCC.getOpcode()) { case X86ISD::SETCC_CARRY: // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, // i.e. it's a comparison against true but the result of SETCC_CARRY is not // truncated to i1 using 'and'. if (checkAgainstTrue && !truncatedToBoolWithAnd) break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); [[fallthrough]]; case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(1); case X86ISD::CMOV: { // Check whether false/true value has canonical one, i.e. 0 or 1. ConstantSDNode *FVal = dyn_cast(SetCC.getOperand(0)); ConstantSDNode *TVal = dyn_cast(SetCC.getOperand(1)); // Quit if true value is not a constant. if (!TVal) return SDValue(); // Quit if false value is not a constant. if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. bool FValIsFalse = true; if (FVal && FVal->getZExtValue() != 0) { if (FVal->getZExtValue() != 1) return SDValue(); // If FVal is 1, opposite cond is needed. 
needOppositeCond = !needOppositeCond; FValIsFalse = false; } // Quit if TVal is not the constant opposite of FVal. if (FValIsFalse && TVal->getZExtValue() != 1) return SDValue(); if (!FValIsFalse && TVal->getZExtValue() != 0) return SDValue(); CC = X86::CondCode(SetCC.getConstantOperandVal(2)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(3); } } return SDValue(); } /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. /// Match: /// (X86or (X86setcc) (X86setcc)) /// (X86cmp (and (X86setcc) (X86setcc)), 0) static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); } isAnd = false; SDValue SetCC0, SetCC1; switch (Cond->getOpcode()) { default: return false; case ISD::AND: case X86ISD::AND: isAnd = true; [[fallthrough]]; case ISD::OR: case X86ISD::OR: SetCC0 = Cond->getOperand(0); SetCC1 = Cond->getOperand(1); break; }; // Make sure we have SETCC nodes, using the same flags value. if (SetCC0.getOpcode() != X86ISD::SETCC || SetCC1.getOpcode() != X86ISD::SETCC || SetCC0->getOperand(1) != SetCC1->getOperand(1)) return false; CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); Flags = SetCC0->getOperand(1); return true; } // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. 
static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
  // Only (X86ISD::ADD Carry, -1) is handled. The result is a flags value to
  // use in place of EFLAGS, or an empty SDValue when nothing matches.
  if (EFLAGS.getOpcode() != X86ISD::ADD ||
      !isAllOnesConstant(EFLAGS.getOperand(1)))
    return SDValue();

  // Strip truncates, zero-extends and (and X, 1) wrappers around the carry
  // value, remembering whether an explicit LSB mask was seen.
  bool FoundAndLSB = false;
  SDValue Carry = EFLAGS.getOperand(0);
  while (Carry.getOpcode() == ISD::TRUNCATE ||
         Carry.getOpcode() == ISD::ZERO_EXTEND ||
         (Carry.getOpcode() == ISD::AND &&
          isOneConstant(Carry.getOperand(1)))) {
    FoundAndLSB |= Carry.getOpcode() == ISD::AND;
    Carry = Carry.getOperand(0);
  }

  if (Carry.getOpcode() == X86ISD::SETCC ||
      Carry.getOpcode() == X86ISD::SETCC_CARRY) {
    // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
    uint64_t CarryCC = Carry.getConstantOperandVal(0);
    SDValue CarryOp1 = Carry.getOperand(1);
    if (CarryCC == X86::COND_B)
      return CarryOp1;
    if (CarryCC == X86::COND_A) {
      // Try to convert COND_A into COND_B in an attempt to facilitate
      // materializing "setb reg".
      //
      // Do not flip "e > c", where "c" is a constant, because Cmp
      // instruction cannot take an immediate as its first operand.
      //
      if (CarryOp1.getOpcode() == X86ISD::SUB &&
          CarryOp1.getNode()->hasOneUse() &&
          CarryOp1.getValueType().isInteger() &&
          !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
        SDValue SubCommute =
            DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
                        CarryOp1.getOperand(1), CarryOp1.getOperand(0));
        return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
      }
    }
    // If this is a check of the z flag of an add with 1, switch to the
    // C flag.
    if (CarryCC == X86::COND_E && CarryOp1.getOpcode() == X86ISD::ADD &&
        isOneConstant(CarryOp1.getOperand(1)))
      return CarryOp1;
  } else if (FoundAndLSB) {
    // The carry is the low bit of some value (optionally shifted right):
    // emit a bit test on that bit via getBT.
    SDLoc DL(Carry);
    SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
    if (Carry.getOpcode() == ISD::SRL) {
      BitNo = Carry.getOperand(1);
      Carry = Carry.getOperand(0);
    }
    return getBT(Carry, BitNo, DL, DAG);
  }

  return SDValue();
}

/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
/// to avoid the inversion.
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
                              SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  // Returns a replacement flags node (possibly with CC rewritten), or an empty
  // SDValue when no simplification applies.
  // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
  if (EFLAGS.getOpcode() != X86ISD::PTEST &&
      EFLAGS.getOpcode() != X86ISD::TESTP)
    return SDValue();

  // PTEST/TESTP sets EFLAGS as:
  // TESTZ: ZF = (Op0 & Op1) == 0
  // TESTC: CF = (~Op0 & Op1) == 0
  // TESTNZC: ZF == 0 && CF == 0
  MVT VT = EFLAGS.getSimpleValueType();
  SDValue Op0 = EFLAGS.getOperand(0);
  SDValue Op1 = EFLAGS.getOperand(1);
  MVT OpVT = Op0.getSimpleValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // TEST*(~X,Y) == TEST*(X,Y)
  if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
    X86::CondCode InvCC;
    switch (CC) {
    case X86::COND_B:
      // testc -> testz.
      InvCC = X86::COND_E;
      break;
    case X86::COND_AE:
      // !testc -> !testz.
      InvCC = X86::COND_NE;
      break;
    case X86::COND_E:
      // testz -> testc.
      InvCC = X86::COND_B;
      break;
    case X86::COND_NE:
      // !testz -> !testc.
      InvCC = X86::COND_AE;
      break;
    case X86::COND_A:
    case X86::COND_BE:
      // testnzc -> testnzc (no change).
      InvCC = CC;
      break;
    default:
      InvCC = X86::COND_INVALID;
      break;
    }

    if (InvCC != X86::COND_INVALID) {
      CC = InvCC;
      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
                         DAG.getBitcast(OpVT, NotOp0), Op1);
    }
  }

  if (CC == X86::COND_B || CC == X86::COND_AE) {
    // TESTC(X,~X) == TESTC(X,-1)
    if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
      if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
        SDLoc DL(EFLAGS);
        return DAG.getNode(
            EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
            DAG.getBitcast(OpVT,
                           DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
      }
    }
  }

  if (CC == X86::COND_E || CC == X86::COND_NE) {
    // TESTZ(X,~Y) == TESTC(Y,X)
    if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
      CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
                         DAG.getBitcast(OpVT, NotOp1), Op0);
    }

    if (Op0 == Op1) {
      SDValue BC = peekThroughBitcasts(Op0);
      EVT BCVT = BC.getValueType();

      // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
      if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
                           DAG.getBitcast(OpVT, BC.getOperand(0)),
                           DAG.getBitcast(OpVT, BC.getOperand(1)));
      }

      // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
      if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
        CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
        return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
                           DAG.getBitcast(OpVT, BC.getOperand(0)),
                           DAG.getBitcast(OpVT, BC.getOperand(1)));
      }

      // If every element is an all-sign value, see if we can use TESTP/MOVMSK
      // to more efficiently extract the sign bits and compare that.
      // TODO: Handle TESTC with comparison inversion.
      // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
      // TESTP/MOVMSK combines to make sure its never worse than PTEST?
      if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
        unsigned EltBits = BCVT.getScalarSizeInBits();
        if (DAG.ComputeNumSignBits(BC) == EltBits) {
          assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
          APInt SignMask = APInt::getSignMask(EltBits);
          if (SDValue Res =
                  TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
            // For vXi16 cases we need to use pmovmskb and extract every other
            // sign bit.
            SDLoc DL(EFLAGS);
            if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
              MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
              MVT FloatVT =
                  MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
              Res = DAG.getBitcast(FloatVT, Res);
              return DAG.getNode(X86ISD::TESTP, DL, VT, Res, Res);
            } else if (EltBits == 16) {
              MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
              Res = DAG.getBitcast(MovmskVT, Res);
              Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
              Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
                                DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
            } else {
              Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
            }
            return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
                               DAG.getConstant(0, DL, MVT::i32));
          }
        }
      }
    }

    // TESTZ(-1,X) == TESTZ(X,X)
    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);

    // TESTZ(X,-1) == TESTZ(X,X)
    if (ISD::isBuildVectorAllOnes(Op1.getNode()))
      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);

    // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
    // TODO: Add COND_NE handling?
    if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
      SDValue Src0 = peekThroughBitcasts(Op0);
      SDValue Src1 = peekThroughBitcasts(Op1);
      if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
        Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
                                 peekThroughBitcasts(Src0.getOperand(1)), true);
        Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
                                 peekThroughBitcasts(Src1.getOperand(1)), true);
        if (Src0 && Src1) {
          MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
          return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
                             DAG.getBitcast(OpVT2, Src0),
                             DAG.getBitcast(OpVT2, Src1));
        }
      }
    }
  }

  return SDValue();
}

// Attempt to simplify the MOVMSK input based on the comparison type.
//
// Only COND_E/COND_NE comparisons of an i32 CMP/SUB against a constant are
// considered. Returns a replacement flags node (CC is rewritten only by the
// final TESTP fold), or an empty SDValue when nothing applies.
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  // Handle eq/ne against zero (any_of).
  // Handle eq/ne against -1 (all_of).
  if (!(CC == X86::COND_E || CC == X86::COND_NE))
    return SDValue();
  if (EFLAGS.getValueType() != MVT::i32)
    return SDValue();
  unsigned CmpOpcode = EFLAGS.getOpcode();
  if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
    return SDValue();
  auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
  if (!CmpConstant)
    return SDValue();
  const APInt &CmpVal = CmpConstant->getAPIntValue();

  SDValue CmpOp = EFLAGS.getOperand(0);
  unsigned CmpBits = CmpOp.getValueSizeInBits();
  assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");

  // Peek through any truncate.
  if (CmpOp.getOpcode() == ISD::TRUNCATE)
    CmpOp = CmpOp.getOperand(0);

  // Bail if we don't find a MOVMSK.
  if (CmpOp.getOpcode() != X86ISD::MOVMSK)
    return SDValue();

  SDValue Vec = CmpOp.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
         "Unexpected MOVMSK operand");
  unsigned NumElts = VecVT.getVectorNumElements();
  unsigned NumEltBits = VecVT.getScalarSizeInBits();

  bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
  bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
                 NumElts <= CmpBits && CmpVal.isMask(NumElts);
  if (!IsAnyOf && !IsAllOf)
    return SDValue();

  // TODO: Check more combining cases for me.
  // Here we check the cmp use number to decide do combining or not.
  // Currently we only get 2 tests about combining "MOVMSK(CONCAT(..))"
  // and "MOVMSK(PCMPEQ(..))" are fit to use this constraint.
  bool IsOneUse = CmpOp.getNode()->hasOneUse();

  // See if we can peek through to a vector with a wider element type, if the
  // signbits extend down to all the sub-elements as well.
  // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
  // potential SimplifyDemandedBits/Elts cases.
  // If we looked through a truncate that discard bits, we can't do this
  // transform.
  // FIXME: We could do this transform for truncates that discarded bits by
  // inserting an AND mask between the new MOVMSK and the CMP.
  if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
    SDValue BC = peekThroughBitcasts(Vec);
    MVT BCVT = BC.getSimpleValueType();
    unsigned BCNumElts = BCVT.getVectorNumElements();
    unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
    if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
        BCNumEltBits > NumEltBits &&
        DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
      SDLoc DL(EFLAGS);
      APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
      return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                         DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
                         DAG.getConstant(CmpMask, DL, MVT::i32));
    }
  }

  // MOVMSK(CONCAT(X,Y)) == 0 ->  MOVMSK(OR(X,Y)).
  // MOVMSK(CONCAT(X,Y)) != 0 ->  MOVMSK(OR(X,Y)).
  // MOVMSK(CONCAT(X,Y)) == -1 ->  MOVMSK(AND(X,Y)).
  // MOVMSK(CONCAT(X,Y)) != -1 ->  MOVMSK(AND(X,Y)).
  if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
    SmallVector<SDValue> Ops;
    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
        Ops.size() == 2) {
      SDLoc DL(EFLAGS);
      EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
      APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
      SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
                              DAG.getBitcast(SubVT, Ops[0]),
                              DAG.getBitcast(SubVT, Ops[1]));
      V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
      return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                         DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
                         DAG.getConstant(CmpMask, DL, MVT::i32));
    }
  }

  // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
  // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
  // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
  // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
  if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
    MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
    SDValue BC = peekThroughBitcasts(Vec);
    // Ensure MOVMSK was testing every signbit of BC.
    if (BC.getValueType().getVectorNumElements() <= NumElts) {
      if (BC.getOpcode() == X86ISD::PCMPEQ) {
        SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
                                BC.getOperand(0), BC.getOperand(1));
        V = DAG.getBitcast(TestVT, V);
        return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
      }
      // Check for 256-bit split vector cases.
      if (BC.getOpcode() == ISD::AND &&
          BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
          BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
        SDValue LHS = BC.getOperand(0);
        SDValue RHS = BC.getOperand(1);
        LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
                          LHS.getOperand(0), LHS.getOperand(1));
        RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
                          RHS.getOperand(0), RHS.getOperand(1));
        LHS = DAG.getBitcast(TestVT, LHS);
        RHS = DAG.getBitcast(TestVT, RHS);
        SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
        return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
      }
    }
  }

  // See if we can avoid a PACKSS by calling MOVMSK on the sources.
  // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
  // sign bits prior to the comparison with zero unless we know that
  // the vXi16 splats the sign bit down to the lower i8 half.
  // TODO: Handle all_of patterns.
  if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
    SDValue VecOp0 = Vec.getOperand(0);
    SDValue VecOp1 = Vec.getOperand(1);
    bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
    bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
    // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
    if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
      SDLoc DL(EFLAGS);
      SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
      Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
      Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
      if (!SignExt0) {
        Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
                             DAG.getConstant(0xAAAA, DL, MVT::i16));
      }
      return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
                         DAG.getConstant(0, DL, MVT::i16));
    }
    // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
    // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
    if (CmpBits >= 16 && Subtarget.hasInt256() &&
        (IsAnyOf || (SignExt0 && SignExt1))) {
      if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
        SDLoc DL(EFLAGS);
        SDValue Result = peekThroughBitcasts(Src);
        if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
            Result.getValueType().getVectorNumElements() <= NumElts) {
          SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
                                  Result.getOperand(0), Result.getOperand(1));
          V = DAG.getBitcast(MVT::v4i64, V);
          return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
        }
        Result = DAG.getBitcast(MVT::v32i8, Result);
        Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
        unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
        if (!SignExt0 || !SignExt1) {
          assert(IsAnyOf &&
                 "Only perform v16i16 signmasks for any_of patterns");
          Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
                               DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
        }
        return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
                           DAG.getConstant(CmpMask, DL, MVT::i32));
      }
    }
  }

  // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
  // Since we peek through a bitcast, we need to be careful if the base vector
  // type has smaller elements than the MOVMSK type.  In that case, even if
  // all the elements are demanded by the shuffle mask, only the "high"
  // elements which have highbits that align with highbits in the MOVMSK vec
  // elements are actually demanded. A simplification of spurious operations
  // on the "low" elements take place during other simplifications.
  //
  // For example:
  // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))) even though all the elements are
  // demanded, because we are swapping around the result can change.
  //
  // To address this, we check that we can scale the shuffle mask to MOVMSK
  // element width (this will ensure "high" elements match). Its slightly overly
  // conservative, but fine for an edge case fold.
  SmallVector<int, 32> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleInputs;
  if (NumElts <= CmpBits &&
      getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
                             ShuffleMask, DAG) &&
      ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
      ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
      canScaleShuffleElements(ShuffleMask, NumElts)) {
    SDLoc DL(EFLAGS);
    SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
    Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
    Result =
        DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
    return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
  }

  // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
  // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
  // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
  // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
  // iff every element is referenced.
  if (NumElts <= CmpBits && Subtarget.hasAVX() &&
      !Subtarget.preferMovmskOverVTest() && IsOneUse &&
      (NumEltBits == 32 || NumEltBits == 64)) {
    SDLoc DL(EFLAGS);
    MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
    MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
    MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
    SDValue LHS = Vec;
    SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
    // The all_of form tests against all-ones and reads the carry flag instead
    // of the zero flag.
    CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
    return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
                       DAG.getBitcast(FloatVT, LHS),
                       DAG.getBitcast(FloatVT, RHS));
  }

  return SDValue();
}

/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
///
/// The individual combines are tried in a fixed order and the first one that
/// produces a value wins; the carry-through-ADD combine is only attempted for
/// COND_B.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  if (CC == X86::COND_B)
    if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
      return Flags;

  if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
    return R;

  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
    return R;

  if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
    return R;

  if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
    return R;

  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
///
/// Operand 0 is the value produced when the condition is false and operand 1
/// the value produced when it is true. Returns the replacement value, or an
/// empty SDValue if no combine applies.
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  SDValue FalseOp = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
  SDValue Cond = N->getOperand(3);

  // cmov X, X, ?, ? --> X
  if (TrueOp == FalseOp)
    return TrueOp;

  // Try to simplify the EFLAGS and condition code operands.
  // We can't always do this as FCMOV only supports a subset of X86 cond.
  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
    if (!(FalseOp.getValueType() == MVT::f80 ||
          (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
          (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
        !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
      SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
                       Flags};
      return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
    }
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
        std::swap(TrueOp, FalseOp);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        Cond = getSETCC(CC, Cond, DL, DAG);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, DL, MVT::i8));
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
        Cond = getSETCC(CC, Cond, DL, DAG);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
        assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
               "Implicit constant truncation");

        bool isFastMultiplier = false;
        if (Diff.ult(10)) {
          switch (Diff.getZExtValue()) {
          default:
            break;
          case 1: // result = add base, cond
          case 2: // result = lea base(    , cond*2)
          case 3: // result = lea base(cond, cond*2)
          case 4: // result = lea base(    , cond*4)
          case 5: // result = lea base(cond, cond*4)
          case 8: // result = lea base(    , cond*8)
          case 9: // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          Cond = getSETCC(CC, Cond, DL, DAG);
          // Zero extend the condition if needed.
          Cond =
              DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, DL, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          return Cond;
        }
      }
    }
  }

  // Handle these cases:
  //   (select (x != c), e, c) -> select (x != c), e, x),
  //   (select (x == c), c, e) -> select (x == c), x, e)
  // where the c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
  //
  // The rationale for this change is that the conditional-move from a constant
  // needs two instructions, however, conditional-move from a register needs
  // only one instruction.
  //
  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
  //  some instruction-combining opportunities. This opt needs to be
  //  postponed as late as possible.
  //
  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // the DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.

    ConstantSDNode *CmpAgainst = nullptr;
    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
        !isa<ConstantSDNode>(Cond.getOperand(0))) {

      if (CC == X86::COND_NE &&
          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }

      if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
        SDValue Ops[] = {FalseOp, Cond.getOperand(0),
                         DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
        return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
      }
    }
  }

  // Transform:
  //
  //   (cmov 1 T (uge T 2))
  //
  // to:
  //
  //   (adc T 0 (sub T 1))
  if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
      Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
    SDValue Cond0 = Cond.getOperand(0);
    if (Cond0.getOpcode() == ISD::TRUNCATE)
      Cond0 = Cond0.getOperand(0);
    auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
      EVT CondVT = Cond->getValueType(0);
      EVT OuterVT = N->getValueType(0);
      // Subtract 1 and generate a carry.
      SDValue NewSub =
          DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
                      DAG.getConstant(1, DL, CondVT));
      SDValue EFLAGS(NewSub.getNode(), 1);
      return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
                         TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
    }
  }

  // Fold and/or of setcc's to double CMOV:
  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
  //
  // This combine lets us generate:
  //   cmovcc1 (jcc1 if we don't have CMOV)
  //   cmovcc2 (same)
  // instead of:
  //   setcc1
  //   setcc2
  //   and/or
  //   cmovne (jne if we don't have CMOV)
  // When we can't use the CMOV instruction, it might increase branch
  // mispredicts.
  // When we can use CMOV, or when there is no mispredict, this improves
  // throughput and reduces register pressure.
  //
  if (CC == X86::COND_NE) {
    SDValue Flags;
    X86::CondCode CC0, CC1;
    bool isAndSetCC;
    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
      if (isAndSetCC) {
        std::swap(FalseOp, TrueOp);
        CC0 = X86::GetOppositeBranchCondition(CC0);
        CC1 = X86::GetOppositeBranchCondition(CC1);
      }

      SDValue LOps[] = {FalseOp, TrueOp,
                        DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
      SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
                       Flags};
      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
      return CMOV;
    }
  }

  // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
  //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
  // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
  //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
  if ((CC == X86::COND_NE || CC == X86::COND_E) &&
      Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
    SDValue Add = TrueOp;
    SDValue Const = FalseOp;
    // Canonicalize the condition code for easier matching and output.
    if (CC == X86::COND_E)
      std::swap(Add, Const);

    // We might have replaced the constant in the cmov with the LHS of the
    // compare. If so change it to the RHS of the compare.
    if (Const == Cond.getOperand(0))
      Const = Cond.getOperand(1);

    // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
    if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
        Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
        (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
         Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
        Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
      EVT VT = N->getValueType(0);
      // This should constant fold.
      SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
      SDValue CMov =
          DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
                      DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
      return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
    }
  }

  return SDValue();
}

/// Different mul shrinking modes.
enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { EVT VT = N->getOperand(0).getValueType(); if (VT.getScalarSizeInBits() != 32) return false; assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); unsigned SignBits[2] = {1, 1}; bool IsPositive[2] = {false, false}; for (unsigned i = 0; i < 2; i++) { SDValue Opd = N->getOperand(i); SignBits[i] = DAG.ComputeNumSignBits(Opd); IsPositive[i] = DAG.SignBitIsZero(Opd); } bool AllPositive = IsPositive[0] && IsPositive[1]; unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); // When ranges are from -128 ~ 127, use MULS8 mode. if (MinSignBits >= 25) Mode = ShrinkMode::MULS8; // When ranges are from 0 ~ 255, use MULU8 mode. else if (AllPositive && MinSignBits >= 24) Mode = ShrinkMode::MULU8; // When ranges are from -32768 ~ 32767, use MULS16 mode. else if (MinSignBits >= 17) Mode = ShrinkMode::MULS16; // When ranges are from 0 ~ 65535, use MULU16 mode. else if (AllPositive && MinSignBits >= 16) Mode = ShrinkMode::MULU16; else return false; return true; } /// When the operands of vector mul are extended from smaller size values, /// like i8 and i16, the type of mul may be shrinked to generate more /// efficient code. Two typical patterns are handled: /// Pattern1: /// %2 = sext/zext %1 to /// %4 = sext/zext %3 to // or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// Pattern2: /// %2 = zext/sext %1 to /// %4 = zext/sext %3 to /// or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// There are four mul shrinking modes: /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is /// -128 to 128, and the scalar value range of %4 is also -128 to 128, /// generate pmullw+sext32 for it (MULS8 mode). 
/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is /// 0 to 255, and the scalar value range of %4 is also 0 to 255, /// generate pmullw+zext32 for it (MULU8 mode). /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, /// generate pmullw+pmulhw for it (MULS16 mode). /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, /// generate pmullw+pmulhuw for it (MULU16 mode). static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Check for legality // pmullw/pmulhw are not supported by SSE. if (!Subtarget.hasSSE2()) return SDValue(); // Check for profitability // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(N, DAG, Mode)) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); if ((NumElts % 2) != 0) return SDValue(); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8) return DAG.getNode((Mode == ShrinkMode::MULU8) ? 
ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT, MulLo); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL, ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. // Generate shuffle functioning as punpcklwd. SmallVector ShuffleMask(NumElts); for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i; ShuffleMask[2 * i + 1] = i + NumElts; } SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResLo = DAG.getBitcast(ResVT, ResLo); // Generate shuffle functioning as punpckhwd. for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i + NumElts / 2; ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; } SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResHi = DAG.getBitcast(ResVT, ResHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL) { auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mult, DL, VT)); Result = DAG.getNode(ISD::SHL, DL, VT, Result, DAG.getConstant(Shift, DL, MVT::i8)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mul1, DL, VT)); Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result, DAG.getConstant(Mul2, DL, VT)); Result = DAG.getNode(isAdd ? 
ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; switch (MulAmt) { default: break; case 11: // mul x, 11 => add ((shl (mul x, 5), 1), x) return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); case 21: // mul x, 21 => add ((shl (mul x, 5), 2), x) return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); case 41: // mul x, 41 => add ((shl (mul x, 5), 3), x) return combineMulShlAddOrSub(5, 3, /*isAdd*/ true); case 22: // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); case 19: // mul x, 19 => add ((shl (mul x, 9), 1), x) return combineMulShlAddOrSub(9, 1, /*isAdd*/ true); case 37: // mul x, 37 => add ((shl (mul x, 9), 2), x) return combineMulShlAddOrSub(9, 2, /*isAdd*/ true); case 73: // mul x, 73 => add ((shl (mul x, 9), 3), x) return combineMulShlAddOrSub(9, 3, /*isAdd*/ true); case 13: // mul x, 13 => add ((shl (mul x, 3), 2), x) return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); case 23: // mul x, 23 => sub ((shl (mul x, 3), 3), x) return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); case 26: // mul x, 26 => add ((mul (mul x, 5), 5), x) return combineMulMulAddOrSub(5, 5, /*isAdd*/ true); case 28: // mul x, 28 => add ((mul (mul x, 9), 3), x) return combineMulMulAddOrSub(9, 3, /*isAdd*/ true); case 29: // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulMulAddOrSub(9, 3, /*isAdd*/ true)); } // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed // by a single LEA. // First check if this a sum of two power of 2s because that's easy. Then // count how many zeros are up to the first bit. // TODO: We can do this even without LEA at a cost of two shifts and an add. 
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { unsigned ScaleShift = llvm::countr_zero(MulAmt); if (ScaleShift >= 1 && ScaleShift < 4) { unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(ShiftAmt, DL, MVT::i8)); SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(ScaleShift, DL, MVT::i8)); return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2); } } return SDValue(); } // If the upper 17 bits of either element are zero and the other element are // zero/sign bits then we can use PMADDWD, which is always at least as quick as // PMULLD, except on KNL. static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); if (Subtarget.isPMADDWDSlow()) return SDValue(); EVT VT = N->getValueType(0); // Only support vXi32 vectors. if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); // Make sure the type is legal or can split/widen to a legal type. // With AVX512 but without BWI, we would need to split v32i16. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 1 || !isPowerOf2_32(NumElts)) return SDValue(); // With AVX512 but without BWI, we would need to split v32i16. if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // If we are zero/sign extending two steps without SSE4.1, its better to // reduce the vmul width instead. 
if (!Subtarget.hasSSE41() && (((N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getScalarValueSizeInBits() <= 8) && (N1.getOpcode() == ISD::ZERO_EXTEND && N1.getOperand(0).getScalarValueSizeInBits() <= 8)) || ((N0.getOpcode() == ISD::SIGN_EXTEND && N0.getOperand(0).getScalarValueSizeInBits() <= 8) && (N1.getOpcode() == ISD::SIGN_EXTEND && N1.getOperand(0).getScalarValueSizeInBits() <= 8)))) return SDValue(); // If we are sign extending a wide vector without SSE4.1, its better to reduce // the vmul width instead. if (!Subtarget.hasSSE41() && (N0.getOpcode() == ISD::SIGN_EXTEND && N0.getOperand(0).getValueSizeInBits() > 128) && (N1.getOpcode() == ISD::SIGN_EXTEND && N1.getOperand(0).getValueSizeInBits() > 128)) return SDValue(); // Sign bits must extend down to the lowest i16. if (DAG.ComputeMaxSignificantBits(N1) > 16 || DAG.ComputeMaxSignificantBits(N0) > 16) return SDValue(); // At least one of the elements must be zero in the upper 17 bits, or can be // safely made zero without altering the final result. auto GetZeroableOp = [&](SDValue Op) { APInt Mask17 = APInt::getHighBitsSet(32, 17); if (DAG.MaskedValueIsZero(Op, Mask17)) return Op; // Mask off upper 16-bits of sign-extended constants. if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT)); if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) { SDValue Src = Op.getOperand(0); // Convert sext(vXi16) to zext(vXi16). if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128) return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src); // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets // which will expand the extension. 
if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) { EVT ExtVT = VT.changeVectorElementType(MVT::i16); Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src); return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src); } } // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG. if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && N->isOnlyUserOf(Op.getNode())) { SDValue Src = Op.getOperand(0); if (Src.getScalarValueSizeInBits() == 16) return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src); } // Convert VSRAI(Op, 16) to VSRLI(Op, 16). if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 && N->isOnlyUserOf(Op.getNode())) { return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0), Op.getOperand(1)); } return SDValue(); }; SDValue ZeroN0 = GetZeroableOp(N0); SDValue ZeroN1 = GetZeroableOp(N1); if (!ZeroN0 && !ZeroN1) return SDValue(); N0 = ZeroN0 ? ZeroN0 : N0; N1 = ZeroN1 ? ZeroN1 : N1; // Use SplitOpsAndApply to handle AVX splitting. auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, DAG.getBitcast(OpVT, Ops[0]), DAG.getBitcast(OpVT, Ops[1])); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder); } static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); EVT VT = N->getValueType(0); // Only support vXi64 vectors. if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 || VT.getVectorNumElements() < 2 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // MULDQ returns the 64-bit result of the signed multiplication of the lower // 32-bits. We can lower with this if the sign bits stretch that far. 
if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 && DAG.ComputeNumSignBits(N1) > 32) { auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder, /*CheckBWI*/ false); } // If the upper bits are zero we can use a single pmuludq. APInt Mask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) { auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder, /*CheckBWI*/ false); } return SDValue(); } static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDLoc DL(N); if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget)) return V; if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget)) return V; if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DL, DAG, Subtarget); // Optimize a single multiply with constant into two operations in order to // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. 
if (DAG.getMachineFunction().getFunction().hasMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); if (VT != MVT::i64 && VT != MVT::i32 && (!VT.isVector() || !VT.isSimple() || !VT.isInteger())) return SDValue(); ConstantSDNode *CNode = isConstOrConstSplat( N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false); const APInt *C = nullptr; if (!CNode) { if (VT.isVector()) if (auto *RawC = getTargetConstantFromNode(N->getOperand(1))) if (auto *SplatC = RawC->getSplatValue()) if (auto *SplatCI = dyn_cast(SplatC)) C = &(SplatCI->getValue()); if (!C || C->getBitWidth() != VT.getScalarSizeInBits()) return SDValue(); } else { C = &(CNode->getAPIntValue()); } if (isPowerOf2_64(C->getZExtValue())) return SDValue(); int64_t SignMulAmt = C->getSExtValue(); assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; SDValue NewMul = SDValue(); if (VT == MVT::i64 || VT == MVT::i32) { if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(AbsMulAmt, DL, VT)); if (SignMulAmt < 0) NewMul = DAG.getNegative(NewMul, DL, VT); return NewMul; } uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; if ((AbsMulAmt % 9) == 0) { MulAmt1 = 9; MulAmt2 = AbsMulAmt / 9; } else if ((AbsMulAmt % 5) == 0) { MulAmt1 = 5; MulAmt2 = AbsMulAmt / 5; } else if ((AbsMulAmt % 3) == 0) { MulAmt1 = 3; MulAmt2 = AbsMulAmt / 3; } // For negative multiply amounts, only allow MulAmt2 to be a power of 2. if (MulAmt2 && (isPowerOf2_64(MulAmt2) || (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) // If second multiplifer is pow2, issue it first. We want the multiply // by 3, 5, or 9 to be folded into the addressing mode unless the lone // use is an add. 
Only do this for positive multiply amounts since the // negate would prevent it from being used as an address mode anyway. std::swap(MulAmt1, MulAmt2); if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(MulAmt1, DL, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); // Negate the result. if (SignMulAmt < 0) NewMul = DAG.getNegative(NewMul, DL, VT); } else if (!Subtarget.slowLEA()) NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); } if (!NewMul) { EVT ShiftVT = VT.isVector() ? VT : MVT::i8; assert(C->getZExtValue() != 0 && C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) && "Both cases that could cause potential overflows should have " "already been handled."); if (isPowerOf2_64(AbsMulAmt - 1)) { // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode( ISD::ADD, DL, VT, N->getOperand(0), DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT))); if (SignMulAmt < 0) NewMul = DAG.getNegative(NewMul, DL, VT); } else if (isPowerOf2_64(AbsMulAmt + 1)) { // (mul x, 2^N - 1) => (sub (shl x, N), x) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT)); // To negate, reverse the operands of the subtract. 
if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul); else NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) && (!VT.isVector() || Subtarget.fastImmVectorShift())) { // (mul x, 2^N + 2) => (add (shl x, N), (add x, x)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT)); NewMul = DAG.getNode( ISD::ADD, DL, VT, NewMul, DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0))); } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) && (!VT.isVector() || Subtarget.fastImmVectorShift())) { // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT)); NewMul = DAG.getNode( ISD::SUB, DL, VT, NewMul, DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0))); } else if (SignMulAmt >= 0 && VT.isVector() && Subtarget.fastImmVectorShift()) { uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt); uint64_t ShiftAmt1; std::optional Opc; if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) { ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit; Opc = ISD::ADD; } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) { ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit; Opc = ISD::SUB; } if (Opc) { SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT)); SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT)); NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2); } } } return NewMul; } // Try to form a MULHU or MULHS node by looking for // (srl (mul ext, ext), 16) // TODO: This is X86 specific because we want to be able to handle wide types // before type legalization. But we can only do it if the vector will be // legalized via widening/splitting. Type legalization can't handle promotion // of a MULHU/MULHS. 
There isn't a way to convey this to the generic DAG // combiner. static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget) { assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && "SRL or SRA node is required here!"); if (!Subtarget.hasSSE2()) return SDValue(); // The operation feeding into the shift must be a multiply. SDValue ShiftOperand = N->getOperand(0); if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse()) return SDValue(); // Input type should be at least vXi32. EVT VT = N->getValueType(0); if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32) return SDValue(); // Need a shift by 16. APInt ShiftAmt; if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) || ShiftAmt != 16) return SDValue(); SDValue LHS = ShiftOperand.getOperand(0); SDValue RHS = ShiftOperand.getOperand(1); unsigned ExtOpc = LHS.getOpcode(); if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || RHS.getOpcode() != ExtOpc) return SDValue(); // Peek through the extends. LHS = LHS.getOperand(0); RHS = RHS.getOperand(0); // Ensure the input types match. EVT MulVT = LHS.getValueType(); if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT) return SDValue(); unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS); ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, DL, VT, Mulh); } static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { using namespace llvm::SDPatternMatch; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDLoc DL(N); // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts // with out-of-bounds clamping. 
if (N0.getOpcode() == ISD::VSELECT && supportedVectorVarShift(VT, Subtarget, ISD::SHL)) { SDValue Cond = N0.getOperand(0); SDValue N00 = N0.getOperand(1); SDValue N01 = N0.getOperand(2); // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt) if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) && sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), m_SpecificCondCode(ISD::SETULT)))) { return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1); } // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt) if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) && sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), m_SpecificCondCode(ISD::SETUGE)))) { return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1); } } // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = N0.getConstantOperandAPInt(1); Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if // we carefully interrogate the mask to make sure we are semantics // preserving. // The transform is not safe if the result of C1 << C2 exceeds the bitwidth // of the underlying setcc_c operation if the setcc_c was zero extended. 
// Consider the following example: // zext(setcc_c) -> i32 0x0000FFFF // c1 -> i32 0x0000FFFF // c2 -> i32 0x00000001 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE if (N00.getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if (N00.getOpcode() == ISD::SIGN_EXTEND && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || N00.getOpcode() == ISD::ANY_EXTEND) && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); } if (MaskOK && Mask != 0) return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } return SDValue(); } static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { using namespace llvm::SDPatternMatch; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); SDLoc DL(N); if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget)) return V; // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt) if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) { SDValue ShrAmtVal; if (sd_match(N1, m_UMin(m_Value(ShrAmtVal), m_SpecificInt(VT.getScalarSizeInBits() - 1)))) return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal); } // fold (SRA (SHL X, ShlConst), SraConst) // into (SHL (sext_in_reg X), ShlConst - SraConst) // or (sext_in_reg X) // or (SRA (sext_in_reg X), SraConst - ShlConst) // depending on relation between SraConst and ShlConst. // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows // us to do the sext_in_reg from corresponding bit. // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). // However the MOVs have 2 advantages to a SHIFT: // 1. MOVs can write to a register that differs from source // 2. 
MOVs accept memory operands if (VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = N01->getAsAPIntVal(); APInt SraConst = N1->getAsAPIntVal(); EVT CVT = N1.getValueType(); if (CVT != N01.getValueType()) return SDValue(); if (SraConst.isNegative()) return SDValue(); for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); // Only deal with (Size - ShlConst) being equal to 8, 16 or 32. if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); if (SraConst.eq(ShlConst)) return NN; if (SraConst.ult(ShlConst)) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(ShlConst - SraConst, DL, CVT)); return DAG.getNode(ISD::SRA, DL, VT, NN, DAG.getConstant(SraConst - ShlConst, DL, CVT)); } return SDValue(); } static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { using namespace llvm::SDPatternMatch; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDLoc DL(N); if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget)) return V; // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts // with out-of-bounds clamping. 
if (N0.getOpcode() == ISD::VSELECT && supportedVectorVarShift(VT, Subtarget, ISD::SRL)) { SDValue Cond = N0.getOperand(0); SDValue N00 = N0.getOperand(1); SDValue N01 = N0.getOperand(2); // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt) if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) && sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), m_SpecificCondCode(ISD::SETULT)))) { return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1); } // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt) if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) && sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits), m_SpecificCondCode(ISD::SETUGE)))) { return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1); } } // Only do this on the last DAG combine as it can interfere with other // combines. if (!DCI.isAfterLegalizeDAG()) return SDValue(); // Try to improve a sequence of srl (and X, C1), C2 by inverting the order. // TODO: This is a generic DAG combine that became an x86-only combine to // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and // and-not ('andn'). if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); auto *ShiftC = dyn_cast(N1); auto *AndC = dyn_cast(N0.getOperand(1)); if (!ShiftC || !AndC) return SDValue(); // If we can shrink the constant mask below 8-bits or 32-bits, then this // transform should reduce code size. It may also enable secondary transforms // from improved known-bits analysis or instruction selection. APInt MaskVal = AndC->getAPIntValue(); // If this can be matched by a zero extend, don't optimize. 
if (MaskVal.isMask()) { unsigned TO = MaskVal.countr_one(); if (TO >= 8 && isPowerOf2_32(TO)) return SDValue(); } APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue()); unsigned OldMaskSize = MaskVal.getSignificantBits(); unsigned NewMaskSize = NewMaskVal.getSignificantBits(); if ((OldMaskSize > 8 && NewMaskSize <= 8) || (OldMaskSize > 32 && NewMaskSize <= 32)) { // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC) SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT); SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1); return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask); } return SDValue(); } static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode"); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT SrcVT = N0.getValueType(); SDValue BC0 = N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0; SDValue BC1 = N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1; // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for // truncation trees that help us avoid lane crossing shuffles. // TODO: There's a lot more we can do for PACK/HADD style shuffle combines. // TODO: We don't handle vXf64 shuffles yet. if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) { if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) { SmallVector ShuffleOps; SmallVector ShuffleMask, ScaledMask; SDValue Vec = peekThroughBitcasts(BCSrc); if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) { resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask); // To keep the HOP LHS/RHS coherency, we must be able to scale the unary // shuffle to a v4X64 width - we can probably relax this in the future. 
if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 && ShuffleOps[0].getValueType().is256BitVector() && scaleShuffleElements(ShuffleMask, 4, ScaledMask)) { SDValue Lo, Hi; MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL); Lo = DAG.getBitcast(SrcVT, Lo); Hi = DAG.getBitcast(SrcVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); Res = DAG.getBitcast(ShufVT, Res); Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask); return DAG.getBitcast(VT, Res); } } } } // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()). if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) { // If either/both ops are a shuffle that can scale to v2x64, // then see if we can perform this as a v4x32 post shuffle. SmallVector Ops0, Ops1; SmallVector Mask0, Mask1, ScaledMask0, ScaledMask1; bool IsShuf0 = getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) && scaleShuffleElements(Mask0, 2, ScaledMask0) && all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; }); bool IsShuf1 = getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) && scaleShuffleElements(Mask1, 2, ScaledMask1) && all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; }); if (IsShuf0 || IsShuf1) { if (!IsShuf0) { Ops0.assign({BC0}); ScaledMask0.assign({0, 1}); } if (!IsShuf1) { Ops1.assign({BC1}); ScaledMask1.assign({0, 1}); } SDValue LHS, RHS; int PostShuffle[4] = {-1, -1, -1, -1}; auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef Ops) { if (M < 0) return true; Idx = M % 2; SDValue Src = Ops[M / 2]; if (!LHS || LHS == Src) { LHS = Src; return true; } if (!RHS || RHS == Src) { Idx += 2; RHS = Src; return true; } return false; }; if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) && FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) && FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) && FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], 
Ops1)) { LHS = DAG.getBitcast(SrcVT, LHS); RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS); MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS); Res = DAG.getBitcast(ShufVT, Res); Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle); return DAG.getBitcast(VT, Res); } } } // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)). if (VT.is256BitVector() && Subtarget.hasInt256()) { SmallVector Mask0, Mask1; SmallVector Ops0, Ops1; SmallVector ScaledMask0, ScaledMask1; if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) && getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) && !Ops0.empty() && !Ops1.empty() && all_of(Ops0, [](SDValue Op) { return Op.getValueType().is256BitVector(); }) && all_of(Ops1, [](SDValue Op) { return Op.getValueType().is256BitVector(); }) && scaleShuffleElements(Mask0, 2, ScaledMask0) && scaleShuffleElements(Mask1, 2, ScaledMask1)) { SDValue Op00 = peekThroughBitcasts(Ops0.front()); SDValue Op10 = peekThroughBitcasts(Ops1.front()); SDValue Op01 = peekThroughBitcasts(Ops0.back()); SDValue Op11 = peekThroughBitcasts(Ops1.back()); if ((Op00 == Op11) && (Op01 == Op10)) { std::swap(Op10, Op11); ShuffleVectorSDNode::commuteMask(ScaledMask1); } if ((Op00 == Op10) && (Op01 == Op11)) { const int Map[4] = {0, 2, 1, 3}; SmallVector ShuffleMask( {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]], Map[ScaledMask1[1]]}); MVT ShufVT = VT.isFloatingPoint() ? 
MVT::v4f64 : MVT::v4i64; SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00), DAG.getBitcast(SrcVT, Op01)); Res = DAG.getBitcast(ShufVT, Res); Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); return DAG.getBitcast(VT, Res); } } } return SDValue(); } static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && "Unexpected pack opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned NumDstElts = VT.getVectorNumElements(); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && N1.getScalarValueSizeInBits() == SrcBitsPerElt && "Unexpected PACKSS/PACKUS input type"); bool IsSigned = (X86ISD::PACKSS == Opcode); // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) && (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) && getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0, /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ true) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1, /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ true)) { unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; APInt Undefs(NumDstElts, 0); SmallVector Bits(NumDstElts, APInt::getZero(DstBitsPerElt)); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0); auto &EltBits = (Elt >= NumSrcEltsPerLane ? 
EltBits1 : EltBits0); if (UndefElts[SrcIdx]) { Undefs.setBit(Lane * NumDstEltsPerLane + Elt); continue; } APInt &Val = EltBits[SrcIdx]; if (IsSigned) { // PACKSS: Truncate signed value with signed saturation. // Source values less than dst minint are saturated to minint. // Source values greater than dst maxint are saturated to maxint. Val = Val.truncSSat(DstBitsPerElt); } else { // PACKUS: Truncate signed value with unsigned saturation. // Source values less than zero are saturated to zero. // Source values greater than dst maxuint are saturated to maxuint. // NOTE: This is different from APInt::truncUSat. if (Val.isIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getZero(DstBitsPerElt); else Val = APInt::getAllOnes(DstBitsPerElt); } Bits[Lane * NumDstEltsPerLane + Elt] = Val; } } return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()). if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) return V; // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)). // Currently limit this to allsignbits cases only. if (IsSigned && (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) && (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) { SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG); SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG); if (Not0 && Not1) { SDLoc DL(N); MVT SrcVT = N0.getSimpleValueType(); SDValue Pack = DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0), DAG.getBitcast(SrcVT, Not1)); return DAG.getNOT(DL, Pack, VT); } } // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular // truncate to create a larger truncate. 
if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 && N0.getOperand(0).getValueType() == MVT::v8i32) { if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) || (!IsSigned && DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) { if (Subtarget.hasVLX()) return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0)); // Widen input to v16i32 so we can truncate that. SDLoc dl(N); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32, N0.getOperand(0), DAG.getUNDEF(MVT::v8i32)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat); } } // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors. if (VT.is128BitVector()) { unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue Src0, Src1; if (N0.getOpcode() == ExtOpc && N0.getOperand(0).getValueType().is64BitVector() && N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { Src0 = N0.getOperand(0); } if (N1.getOpcode() == ExtOpc && N1.getOperand(0).getValueType().is64BitVector() && N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { Src1 = N1.getOperand(0); } if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) { assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)"); Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType()); Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType()); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1); } // Try again with pack(*_extend_vector_inreg, undef). unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG : ISD::ZERO_EXTEND_VECTOR_INREG; if (N0.getOpcode() == VecInRegOpc && N1.isUndef() && N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt) return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0), DAG); } // Attempt to combine as shuffle. 
SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; return SDValue(); } static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() || X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) && "Unexpected horizontal add/sub opcode"); if (!shouldUseHorizontalOp(true, DAG, Subtarget)) { MVT VT = N->getSimpleValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)). if (LHS != RHS && LHS.getOpcode() == N->getOpcode() && LHS.getOpcode() == RHS.getOpcode() && LHS.getValueType() == RHS.getValueType() && N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) { SDValue LHS0 = LHS.getOperand(0); SDValue LHS1 = LHS.getOperand(1); SDValue RHS0 = RHS.getOperand(0); SDValue RHS1 = RHS.getOperand(1); if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) && (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) { SDLoc DL(N); SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(), LHS0.isUndef() ? LHS1 : LHS0, RHS0.isUndef() ? RHS1 : RHS0); MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); Res = DAG.getBitcast(ShufVT, Res); SDValue NewLHS = DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res, getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG)); SDValue NewRHS = DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res, getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG)); return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS), DAG.getBitcast(VT, NewRHS)); } } } // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()). 
if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) return V; return SDValue(); } static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() || X86ISD::VSRL == N->getOpcode()) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N0.getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Detect constant shift amounts. APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ false)) { unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false); return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0, EltBits[0].getZExtValue(), DAG); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || X86ISD::VSRLI == Opcode) && "Unexpected shift opcode"); bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type"); // (shift undef, X) -> 0 if (N0.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); // Out of range logical bit shifts are guaranteed to be zero. 
// Out of range arithmetic bit shifts splat the sign bit. unsigned ShiftVal = N->getConstantOperandVal(1); if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); ShiftVal = NumBitsPerElt - 1; } // (shift X, 0) -> X if (!ShiftVal) return N0; // (shift 0, C) -> 0 if (ISD::isBuildVectorAllZeros(N0.getNode())) // N0 is all zeros or undef. We guarantee that the bits shifted into the // result are all zeros, not undef. return DAG.getConstant(0, SDLoc(N), VT); // (VSRAI -1, C) -> -1 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode())) // N0 is all ones or undef. We guarantee that the bits shifted into the // result are all ones, not undef. return DAG.getConstant(-1, SDLoc(N), VT); auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) { unsigned NewShiftVal = Amt0 + Amt1; if (NewShiftVal >= NumBitsPerElt) { // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); NewShiftVal = NumBitsPerElt - 1; } return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0), DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8)); }; // (shift (shift X, C2), C1) -> (shift X, (C1 + C2)) if (Opcode == N0.getOpcode()) return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1)); // (shl (add X, X), C) -> (shl X, (C + 1)) if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1)) return MergeShifts(N0.getOperand(0), ShiftVal, 1); // We can decode 'whole byte' logical bit shifts as shuffles. if (LogicalShift && (ShiftVal % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern: // psrad(pshufd(psllq(X,63),1,1,3,3),31) -> // pshufd(psrad(pslld(X,31),31),0,0,2,2). 
if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 && N0.getOpcode() == X86ISD::PSHUFD && N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) && N0->hasOneUse()) { SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0)); if (BC.getOpcode() == X86ISD::VSHLI && BC.getScalarValueSizeInBits() == 64 && BC.getConstantOperandVal(1) == 63) { SDLoc DL(N); SDValue Src = BC.getOperand(0); Src = DAG.getBitcast(VT, Src); Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src, getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG)); Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1); Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1); return Src; } } auto TryConstantFold = [&](SDValue V) { APInt UndefElts; SmallVector EltBits; if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits, /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ true)) return SDValue(); assert(EltBits.size() == VT.getVectorNumElements() && "Unexpected shift value type"); // Undef elements need to fold to 0. It's possible SimplifyDemandedBits // created an undef input due to no input bits being demanded, but user // still expects 0 in other bits. for (unsigned i = 0, e = EltBits.size(); i != e; ++i) { APInt &Elt = EltBits[i]; if (UndefElts[i]) Elt = 0; else if (X86ISD::VSHLI == Opcode) Elt <<= ShiftVal; else if (X86ISD::VSRAI == Opcode) Elt.ashrInPlace(ShiftVal); else Elt.lshrInPlace(ShiftVal); } // Reset undef elements since they were zeroed above. UndefElts = 0; return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); }; // Constant Folding. if (N->isOnlyUserOf(N0.getNode())) { if (SDValue C = TryConstantFold(N0)) return C; // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1)) // Don't break NOT patterns. 
SDValue BC = peekThroughOneUseBitcasts(N0); if (ISD::isBitwiseLogicOp(BC.getOpcode()) && BC->isOnlyUserOf(BC.getOperand(1).getNode()) && !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) { if (SDValue RHS = TryConstantFold(BC.getOperand(1))) { SDLoc DL(N); SDValue LHS = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(VT, BC.getOperand(0)), N1); return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS); } } } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt), DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); unsigned Opcode = N->getOpcode(); assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) || (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) || Opcode == ISD::INSERT_VECTOR_ELT) && "Unexpected vector insertion"); SDValue Vec = N->getOperand(0); SDValue Scl = N->getOperand(1); SDValue Idx = N->getOperand(2); // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt). if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx)) return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl); if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) { unsigned NumBitsPerElt = VT.getScalarSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt), DCI)) return SDValue(N, 0); } // Attempt to combine insertion patterns to a shuffle. if (VT.isSimple() && DCI.isAfterLegalizeDAG()) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } return SDValue(); } /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for /// OR -> CMPNEQSS. 
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  unsigned LogicOpc;
  if (!Subtarget.hasSSE2() || !isAndOrOfSetCCs(SDValue(N, 0U), LogicOpc))
    return SDValue();

  SDValue SetCC0 = N->getOperand(0);
  SDValue SetCC1 = N->getOperand(1);
  SDValue Cmp = SetCC0.getOperand(1);
  SDLoc DL(N);

  // Both SETCCs must consume the very same FCMP node.
  if (Cmp.getOpcode() != X86ISD::FCMP || Cmp != SetCC1.getOperand(1))
    return SDValue();

  SDValue CmpLHS = Cmp->getOperand(0);
  SDValue CmpRHS = Cmp->getOperand(1);
  EVT CmpVT = CmpLHS.getValueType();

  bool IsSupportedFP = CmpVT == MVT::f32 || CmpVT == MVT::f64 ||
                       (CmpVT == MVT::f16 && Subtarget.hasFP16());
  if (!IsSupportedFP)
    return SDValue();

  // Give up as soon as any user might want flags; only copies and integer
  // extensions are accepted as users.
  for (const SDNode *User : N->uses()) {
    switch (User->getOpcode()) {
    case ISD::CopyToReg:
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      continue;
    default:
      return SDValue();
    }
  }

  // Order the condition codes so that an E/NE code, if present, comes first.
  auto CC0 = static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
  auto CC1 = static_cast<X86::CondCode>(SetCC1.getConstantOperandVal(0));
  if (CC1 == X86::COND_E || CC1 == X86::COND_NE)
    std::swap(CC0, CC1);

  bool IsOrderedEq = CC0 == X86::COND_E && CC1 == X86::COND_NP;
  bool IsUnorderedNe = CC0 == X86::COND_NE && CC1 == X86::COND_P;
  if (!IsOrderedEq && !IsUnorderedNe)
    return SDValue();

  // FIXME: need symbolic constants for these magic numbers.
  // See X86ATTInstPrinter.cpp:printSSECC().
  unsigned SSECC = (CC0 == X86::COND_E) ? 0 : 4;
  SDValue SSECCOp = DAG.getTargetConstant(SSECC, DL, MVT::i8);

  if (Subtarget.hasAVX512()) {
    SDValue MaskCmp =
        DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CmpLHS, CmpRHS, SSECCOp);
    // Need to fill with zeros to ensure the bitcast will produce zeroes
    // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
    SDValue Widened = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
                                  DAG.getConstant(0, DL, MVT::v16i1), MaskCmp,
                                  DAG.getIntPtrConstant(0, DL));
    return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Widened), DL,
                              N->getSimpleValueType(0));
  }

  SDValue FPMask =
      DAG.getNode(X86ISD::FSETCC, DL, CmpVT, CmpLHS, CmpRHS, SSECCOp);

  bool Is64BitFP = (CmpVT == MVT::f64);
  MVT IntVT = Is64BitFP ? MVT::i64 : MVT::i32;

  if (Is64BitFP && !Subtarget.is64Bit()) {
    // On a 32-bit target, we cannot bitcast the 64-bit float to a
    // 64-bit integer, since that's not a legal type. Since
    // the compare result is all ones or all zeroes, we don't need all the
    // bits, but can do this little dance to extract the lowest 32 bits
    // and work with those going forward.
    SDValue AsV2F64 =
        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, FPMask);
    SDValue AsV4F32 = DAG.getBitcast(MVT::v4f32, AsV2F64);
    FPMask = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, AsV4F32,
                         DAG.getIntPtrConstant(0, DL));
    IntVT = MVT::i32;
  }

  // Keep only the low bit of the all-ones/all-zeroes result and narrow it to
  // i8.
  SDValue IntMask = DAG.getBitcast(IntVT, FPMask);
  SDValue LowBit =
      DAG.getNode(ISD::AND, DL, IntVT, IntMask, DAG.getConstant(1, DL, IntVT));
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, LowBit);
}

/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");

  // Only 128/256/512-bit vector types are handled.
  MVT VT = N->getSimpleValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  // X is the value whose complement is being ANDed, Y is the other operand.
  SDValue X, Y;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  if (SDValue Not = IsNOT(N0, DAG)) {
    X = Not;
    Y = N1;
  } else if (SDValue Not = IsNOT(N1, DAG)) {
    X = Not;
    Y = N0;
  } else
    return SDValue();

  X = DAG.getBitcast(VT, X);
  Y = DAG.getBitcast(VT, Y);
  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}

/// Try to fold:
///   and (vector_shuffle<Z,...,Z>
///            (insert_vector_elt undef, (xor X, -1), Z), undef), Y
///   ->
///   andnp (vector_shuffle<Z,...,Z>
///              (insert_vector_elt undef, X, Z), undef), Y
///
/// \returns the replacement value, or an empty SDValue if the pattern does
/// not match or the resulting type is not legal.
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");

  EVT VT = N->getValueType(0);
  // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
  // value and require extra moves.
  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
        ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
    return SDValue();

  // If V is a one-use splat shuffle of a one-use insert_vector_elt of a NOT'd
  // scalar into undef, return the same splat built from the un-negated scalar;
  // otherwise return an empty SDValue.
  auto GetNot = [&DAG](SDValue V) {
    auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
    // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
    // end-users are ISD::AND including cases
    // (and(extract_vector_element(SVN), Y)).
    if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
        !SVN->getOperand(1).isUndef()) {
      return SDValue();
    }
    SDValue IVEN = SVN->getOperand(0);
    if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
        !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
      return SDValue();
    // The inserted lane must be the lane that the shuffle splats.
    if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
        IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
      return SDValue();
    SDValue Src = IVEN.getOperand(1);
    if (SDValue Not = IsNOT(Src, DAG)) {
      SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
      SDValue NotIVEN =
          DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
                      IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
      return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
                                  SVN->getOperand(1), SVN->getMask());
    }
    return SDValue();
  };

  SDValue X, Y;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (SDValue Not = GetNot(N0)) {
    X = Not;
    Y = N1;
  } else if (SDValue Not = GetNot(N1)) {
    X = Not;
    Y = N0;
  } else
    return SDValue();

  X = DAG.getBitcast(VT, X);
  Y = DAG.getBitcast(VT, Y);
  SDLoc DL(N);

  // We do not split for SSE at all, but we need to split vectors for AVX1 and
  // AVX2.
  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
      TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
    SDValue LoX, HiX;
    std::tie(LoX, HiX) = splitVector(X, DAG, DL);
    SDValue LoY, HiY;
    std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
    EVT SplitVT = LoX.getValueType();
    SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
    SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
  }

  if (TLI.isTypeLegal(VT))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});

  return SDValue();
}

// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
// logical operations, like in the example below.
//   or (and (truncate x, truncate y)),
//      (xor (truncate z, build_vector (constants)))
// Given a target type \p VT, we generate
//   or (and x, y), (xor z, zext(build_vector (constants)))
// given x, y and z are of type \p VT. We can do so, if operands are either
// truncates from VT types, the second operand is a vector of constants or can
// be recursively promoted.
//
// \p Depth is the current recursion depth; the walk stops once it reaches
// SelectionDAG::MaxRecursionDepth. Returns an empty SDValue on failure.
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
                                     SelectionDAG &DAG, unsigned Depth) {
  // Limit recursion to avoid excessive compile times.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return SDValue();

  if (!ISD::isBitwiseLogicOp(N.getOpcode()))
    return SDValue();

  SDValue N0 = N.getOperand(0);
  SDValue N1 = N.getOperand(1);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
    return SDValue();

  if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
    N0 = NN0;
  else {
    // The left side has to be a trunc.
    if (N0.getOpcode() != ISD::TRUNCATE)
      return SDValue();

    // The type of the truncated inputs.
    if (N0.getOperand(0).getValueType() != VT)
      return SDValue();

    N0 = N0.getOperand(0);
  }

  if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
    N1 = NN1;
  else {
    // The right side has to be a 'trunc' or a (foldable) constant.
    bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                    N1.getOperand(0).getValueType() == VT;
    if (RHSTrunc)
      N1 = N1.getOperand(0);
    else if (SDValue Cst =
                 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
      N1 = Cst;
    else
      return SDValue();
  }

  return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
}

// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N.getValueType(); assert(VT.isVector() && "Expected vector type"); assert((N.getOpcode() == ISD::ANY_EXTEND || N.getOpcode() == ISD::ZERO_EXTEND || N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N.getOperand(0); EVT NarrowVT = Narrow.getValueType(); // Generate the wide operation. SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0); if (!Op) return SDValue(); switch (N.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: return DAG.getZeroExtendInReg(Op, DL, NarrowVT); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); } } static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) { unsigned FPOpcode; switch (Opcode) { // clang-format off default: llvm_unreachable("Unexpected input node for FP logic conversion"); case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; // clang-format on } return FPOpcode; } /// If both input operands of a logic op are being cast from floating-point /// types or FP compares, try to convert this into a floating-point logic node /// to avoid unnecessary moves from SSE to integer registers. 
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Both operands must be bitcasts, or both must be setccs.
  if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
        (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N10 = N1.getOperand(0);
  EVT N00Type = N00.getValueType();
  EVT N10Type = N10.getValueType();

  // Ensure that both types are the same and are legal scalar fp types.
  if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
                              (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
                              (Subtarget.hasFP16() && N00Type == MVT::f16)))
    return SDValue();

  // logic (bitcast X), (bitcast Y) --> bitcast (fplogic X, Y)
  if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
    unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
    SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
    return DAG.getBitcast(VT, FPLogic);
  }

  if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
      !N1.hasOneUse())
    return SDValue();

  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();

  // The vector ISA for FP predicates is incomplete before AVX, so converting
  // COMIS* to CMPS* may not be a win before AVX.
  if (!Subtarget.hasAVX() &&
      !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
    return SDValue();

  // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
  // and vector logic:
  // logic (setcc N00, N01), (setcc N10, N11) -->
  // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
  unsigned NumElts = 128 / N00Type.getSizeInBits();
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
  EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
  SDValue N01 = N0.getOperand(1);
  SDValue N11 = N1.getOperand(1);
  SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
  SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
  SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
  SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
  SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
  SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
  SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
}

// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
// to reduce XMM->GPR traffic.
// Returns an empty SDValue if the pattern does not match.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
  unsigned Opc = N->getOpcode();
  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
         "Unexpected bit opcode");

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Both operands must be single use MOVMSK.
  if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
      N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
    return SDValue();

  SDValue Vec0 = N0.getOperand(0);
  SDValue Vec1 = N1.getOperand(0);
  EVT VecVT0 = Vec0.getValueType();
  EVT VecVT1 = Vec1.getValueType();

  // Both MOVMSK operands must be from vectors of the same size and same element
  // size, but its OK for a fp/int diff.
  if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
      VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
    return SDValue();

  SDLoc DL(N);
  // The logic op is performed in the first vector's type, so pick the FP
  // flavour of the opcode when that type is floating point.
  unsigned VecOpc =
      VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
  SDValue Result =
      DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
  return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
}

// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
// handles in InstCombine.
// Only the immediate vector shifts VSHLI/VSRLI/VSRAI are handled.
static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
  unsigned Opc = N->getOpcode();
  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
         "Unexpected bit opcode");

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // Both operands must be single use.
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  // Search for matching shifts.
  SDValue BC0 = peekThroughOneUseBitcasts(N0);
  SDValue BC1 = peekThroughOneUseBitcasts(N1);

  unsigned BCOpc = BC0.getOpcode();
  EVT BCVT = BC0.getValueType();
  if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
    return SDValue();

  switch (BCOpc) {
  case X86ISD::VSHLI:
  case X86ISD::VSRLI:
  case X86ISD::VSRAI: {
    // Both shifts must use the same shift amount operand.
    if (BC0.getOperand(1) != BC1.getOperand(1))
      return SDValue();

    SDLoc DL(N);
    SDValue BitOp =
        DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
    SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
    return DAG.getBitcast(VT, Shift);
  }
  }

  return SDValue();
}

// Attempt to fold:
// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
// TODO: Handle PACKUS.
static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
  unsigned Opc = N->getOpcode();
  assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
         "Unexpected bit opcode");

  SDValue Pack0 = N->getOperand(0);
  SDValue Pack1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // Neither operand may have another user.
  if (!Pack0.hasOneUse() || !Pack1.hasOneUse())
    return SDValue();

  // Both sides must be PACKSS nodes (possibly behind one-use bitcasts).
  Pack0 = peekThroughOneUseBitcasts(Pack0);
  Pack1 = peekThroughOneUseBitcasts(Pack1);
  bool BothPACKSS = Pack0.getOpcode() == X86ISD::PACKSS &&
                    Pack1.getOpcode() == X86ISD::PACKSS;
  if (!BothPACKSS)
    return SDValue();

  MVT DstVT = Pack0.getSimpleValueType();
  if (DstVT != Pack1.getSimpleValueType())
    return SDValue();

  MVT SrcVT = Pack0.getOperand(0).getSimpleValueType();
  unsigned NumSrcBits = SrcVT.getScalarSizeInBits();

  // Only handle packs whose four inputs are all-sign-bits values, i.e. every
  // source element is known to be either 0 or -1.
  for (SDValue Src : {Pack0.getOperand(0), Pack0.getOperand(1),
                      Pack1.getOperand(0), Pack1.getOperand(1)})
    if (DAG.ComputeNumSignBits(Src) != NumSrcBits)
      return SDValue();

  SDLoc DL(N);
  SDValue Lo =
      DAG.getNode(Opc, DL, SrcVT, Pack0.getOperand(0), Pack1.getOperand(0));
  SDValue Hi =
      DAG.getNode(Opc, DL, SrcVT, Pack0.getOperand(1), Pack1.getOperand(1));
  SDValue Packed = DAG.getNode(X86ISD::PACKSS, DL, DstVT, Lo, Hi);
  return DAG.getBitcast(VT, Packed);
}

/// Two folds on a vector 'and' that avoid materializing a vector constant:
///  1. and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
///  2. When one operand is a zero/all-bits value and the other is a splat
///     low-bits mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT),
///     replace the 'and' with a logical shift-right.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
  EVT VT = Op0.getValueType();
  if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
    return SDValue();

  // Try to convert an "is positive" signbit masking operation into arithmetic
  // shift and "andn". This saves a materialization of a -1 vector constant.
  // The "is negative" variant should be handled more generally because it only
  // requires "and" rather than "andn":
  // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
  //
  // This is limited to the original type to avoid producing even more bitcasts.
  // If the bitcasts can't be eliminated, then it is unlikely that this fold
  // will be profitable.
  if (N->getValueType(0) == VT &&
      supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
    // Matches a single-use (pcmpgt X, -1).
    auto IsOneUseIsPositive = [](SDValue V) {
      return V.getOpcode() == X86ISD::PCMPGT &&
             isAllOnesOrAllOnesSplat(V.getOperand(1)) && V.hasOneUse();
    };

    SDValue X, Y;
    if (IsOneUseIsPositive(Op1)) {
      X = Op1.getOperand(0);
      Y = Op0;
    } else if (IsOneUseIsPositive(Op0)) {
      X = Op0.getOperand(0);
      Y = Op1;
    }

    if (X && Y) {
      SDLoc DL(N);
      unsigned SignBitIdx = VT.getScalarSizeInBits() - 1;
      SDValue SignSplat = getTargetVShiftByConstNode(
          X86ISD::VSRAI, DL, VT.getSimpleVT(), X, SignBitIdx, DAG);
      return DAG.getNode(X86ISD::ANDNP, DL, VT, SignSplat, Y);
    }
  }

  // From here on Op1 has to be a splat of a low-bits mask.
  APInt MaskVal;
  if (!X86::isConstantSplat(Op1, MaskVal, false) || !MaskVal.isMask())
    return SDValue();

  // Don't prevent creation of ANDN.
  if (isBitwiseNot(Op0))
    return SDValue();

  if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
    return SDValue();

  // Op0 must be all-sign-bits (each element 0 or -1) for the shift to be
  // equivalent to the mask.
  unsigned EltBits = VT.getScalarSizeInBits();
  if (EltBits != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  SDLoc DL(N);
  unsigned MaskBits = MaskVal.countr_one();
  SDValue Amt = DAG.getTargetConstant(EltBits - MaskBits, DL, MVT::i8);
  SDValue Srl = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, Amt);
  return DAG.getBitcast(N->getValueType(0), Srl);
}

// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) { if (Ld->isIndexed()) return SDValue(); SDValue Base = Ld->getBasePtr(); if (Base.getOpcode() != ISD::ADD) return SDValue(); SDValue ShiftedIndex = Base.getOperand(0); if (ShiftedIndex.getOpcode() != ISD::SHL) return SDValue(); return ShiftedIndex.getOperand(0); } static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { return Subtarget.hasBMI2() && (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit())); } // This function recognizes cases where X86 bzhi instruction can replace and // 'and-load' sequence. // In case of loading integer value from an array of constants which is defined // as follows: // // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1} // // then applying a bitwise and on the result with another input. // It's equivalent to performing bzhi (zero high bits) on the input, with the // same index of the load. static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Node->getSimpleValueType(0); SDLoc dl(Node); // Check if subtarget has BZHI instruction for the node's type if (!hasBZHI(Subtarget, VT)) return SDValue(); // Try matching the pattern for both operands. 
for (unsigned i = 0; i < 2; i++) { SDValue N = Node->getOperand(i); LoadSDNode *Ld = dyn_cast(N.getNode()); // continue if the operand is not a load instruction if (!Ld) return SDValue(); const Value *MemOp = Ld->getMemOperand()->getValue(); if (!MemOp) return SDValue(); if (const GetElementPtrInst *GEP = dyn_cast(MemOp)) { if (GlobalVariable *GV = dyn_cast(GEP->getOperand(0))) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) { Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); if (!isa(Init) || !Ty->getArrayElementType()->isIntegerTy() || Ty->getArrayElementType()->getScalarSizeInBits() != VT.getSizeInBits() || Ty->getArrayNumElements() > Ty->getArrayElementType()->getScalarSizeInBits()) continue; // Check if the array's constant elements are suitable to our case. uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); bool ConstantsMatch = true; for (uint64_t j = 0; j < ArrayElementCount; j++) { auto *Elem = cast(Init->getAggregateElement(j)); if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) { ConstantsMatch = false; break; } } if (!ConstantsMatch) continue; // Do the transformation (For 32-bit type): // -> (and (load arr[idx]), inp) // <- (and (srl 0xFFFFFFFF, (sub 32, idx))) // that will be replaced with one bzhi instruction. SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0); SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32); // Get the Node which indexes into the array. 
SDValue Index = getIndexFromUnindexedLoad(Ld); if (!Index) return SDValue(); Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32); SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index); Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub); SDValue AllOnes = DAG.getAllOnesConstant(dl, VT); SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub); return DAG.getNode(ISD::AND, dl, VT, Inp, LShr); } } } } return SDValue(); } // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C) // Where C is a mask containing the same number of bits as the setcc and // where the setcc will freely 0 upper bits of k-register. We can replace the // undef in the concat with 0s and remove the AND. This mainly helps with // v2i1/v4i1 setcc being casted to scalar. static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); EVT VT = N->getValueType(0); // Make sure this is an AND with constant. We will check the value of the // constant later. auto *C1 = dyn_cast(N->getOperand(1)); if (!C1) return SDValue(); // This is implied by the ConstantSDNode. assert(!VT.isVector() && "Expected scalar VT!"); SDValue Src = N->getOperand(0); if (!Src.hasOneUse()) return SDValue(); // (Optionally) peek through any_extend(). if (Src.getOpcode() == ISD::ANY_EXTEND) { if (!Src.getOperand(0).hasOneUse()) return SDValue(); Src = Src.getOperand(0); } if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse()) return SDValue(); Src = Src.getOperand(0); EVT SrcVT = Src.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 || !TLI.isTypeLegal(SrcVT)) return SDValue(); if (Src.getOpcode() != ISD::CONCAT_VECTORS) return SDValue(); // We only care about the first subvector of the concat, we expect the // other subvectors to be ignored due to the AND if we make the change. 
SDValue SubVec = Src.getOperand(0); EVT SubVecVT = SubVec.getValueType(); // The RHS of the AND should be a mask with as many bits as SubVec. if (!TLI.isTypeLegal(SubVecVT) || !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements())) return SDValue(); // First subvector should be a setcc with a legal result type or a // AND containing at least one setcc with a legal result type. auto IsLegalSetCC = [&](SDValue V) { if (V.getOpcode() != ISD::SETCC) return false; EVT SetccVT = V.getOperand(0).getValueType(); if (!TLI.isTypeLegal(SetccVT) || !(Subtarget.hasVLX() || SetccVT.is512BitVector())) return false; if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32)) return false; return true; }; if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND && (IsLegalSetCC(SubVec.getOperand(0)) || IsLegalSetCC(SubVec.getOperand(1)))))) return SDValue(); // We passed all the checks. Rebuild the concat_vectors with zeroes // and cast it back to VT. SDLoc dl(N); SmallVector Ops(Src.getNumOperands(), DAG.getConstant(0, dl, SubVecVT)); Ops[0] = SubVec; SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits()); return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT); } static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth) { // We don't want to go crazy with the recursion here. This isn't a super // important optimization. static constexpr unsigned kMaxDepth = 2; // Only do this re-ordering if op has one use. if (!Op.hasOneUse()) return SDValue(); SDLoc DL(Op); // If we hit another assosiative op, recurse further. if (Op.getOpcode() == Opc) { // Done recursing. 
if (Depth++ >= kMaxDepth) return SDValue(); for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) if (SDValue R = getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth)) return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R, Op.getOperand(1 - OpIdx)); } else if (Op.getOpcode() == ISD::SUB) { if (Opc == ISD::AND) { // BLSI: (and x, (sub 0, x)) if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq) return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op); } // Opc must be ISD::AND or ISD::XOR // BLSR: (and x, (sub x, 1)) // BLSMSK: (xor x, (sub x, 1)) if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq) return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op); } else if (Op.getOpcode() == ISD::ADD) { // Opc must be ISD::AND or ISD::XOR // BLSR: (and x, (add x, -1)) // BLSMSK: (xor x, (add x, -1)) if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq) return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op); } return SDValue(); } static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); // Make sure this node is a candidate for BMI instructions. if (!Subtarget.hasBMI() || !VT.isScalarInteger() || (VT != MVT::i32 && VT != MVT::i64)) return SDValue(); assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR); // Try and match LHS and RHS. 
for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) if (SDValue OpMatch = getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx), N->getOperand(1 - OpIdx), 0)) return OpMatch; return SDValue(); } static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST) { // cmp(setcc(cc, X), 0) // brcond ne // -> // X // brcond cc // sub(setcc(cc, X), 1) // brcond ne // -> // X // brcond ~cc // // if only flag has users SDValue SetCC = N->getOperand(0); if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse()) return SDValue(); // Check the only user of flag is `brcond ne`. SDNode *BrCond = *Flag->uses().begin(); if (BrCond->getOpcode() != X86ISD::BRCOND) return SDValue(); unsigned CondNo = 2; if (static_cast(BrCond->getConstantOperandVal(CondNo)) != X86::COND_NE) return SDValue(); SDValue X = SetCC.getOperand(1); // sub has two results while X only have one. DAG combine assumes the value // type matches. if (N->getOpcode() == X86ISD::SUB) X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N)); SDValue CCN = SetCC.getOperand(0); X86::CondCode CC = static_cast(CCN->getAsAPIntVal().getSExtValue()); X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC); // Update CC for the consumer of the flag. // The old CC is `ne`. Hence, when comparing the result with 0, we are // checking if the second condition evaluates to true. When comparing the // result with 1, we are checking uf the second condition evaluates to false. SmallVector Ops(BrCond->op_values()); if (isNullConstant(N->getOperand(1))) Ops[CondNo] = CCN; else if (isOneConstant(N->getOperand(1))) Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8); else llvm_unreachable("expect constant 0 or 1"); SDValue NewBrCond = DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops); // Avoid self-assign error b/c CC1 can be `e/ne`. 
if (BrCond != NewBrCond.getNode()) DCI.CombineTo(BrCond, NewBrCond); return X; } static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST) { // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y))) // -> // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0)) // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0))) // -> // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0)) // // where cflags is determined by cc1. if (!ST.hasCCMP()) return SDValue(); SDValue SetCC0 = N->getOperand(0); SDValue SetCC1 = N->getOperand(1); if (SetCC0.getOpcode() != X86ISD::SETCC || SetCC1.getOpcode() != X86ISD::SETCC) return SDValue(); auto GetCombineToOpc = [&](SDValue V) -> unsigned { SDValue Op = V.getOperand(1); unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::SUB) return X86ISD::CCMP; if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1))) return X86ISD::CTEST; return 0U; }; unsigned NewOpc = 0; // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP // appear on the right. if (!(NewOpc = GetCombineToOpc(SetCC1))) { std::swap(SetCC0, SetCC1); if (!(NewOpc = GetCombineToOpc(SetCC1))) return SDValue(); } X86::CondCode CC0 = static_cast(SetCC0.getConstantOperandVal(0)); // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP. if (CC0 == X86::COND_P || CC0 == X86::COND_NP) return SDValue(); bool IsOR = N->getOpcode() == ISD::OR; // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC // evaluates to true. So we need to inverse CC0 as SrcCC when the logic // operator is OR. Similar for CC1. SDValue SrcCC = IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0), SDLoc(SetCC0.getOperand(0)), MVT::i8) : SetCC0.getOperand(0); SDValue CC1N = SetCC1.getOperand(0); X86::CondCode CC1 = static_cast(CC1N->getAsAPIntVal().getSExtValue()); X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1); X86::CondCode CFlagsCC = IsOR ? 
CC1 : OppositeCC1; SDLoc DL(N); SDValue CFlags = DAG.getTargetConstant( X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8); SDValue Sub = SetCC1.getOperand(1); // Replace any uses of the old flag produced by SUB/CMP with the new one // produced by CCMP/CTEST. SDValue CCMP = (NewOpc == X86ISD::CCMP) ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32, {Sub.getOperand(0), Sub.getOperand(1), CFlags, SrcCC, SetCC0.getOperand(1)}) : DAG.getNode(X86ISD::CTEST, DL, MVT::i32, {Sub.getOperand(0), Sub.getOperand(0), CFlags, SrcCC, SetCC0.getOperand(1)}); return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP}); } static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is SSE1 only convert to FAND to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, DAG.getNode(X86ISD::FAND, dl, MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } // Use a 32-bit and+zext if upper bits known zero. if (VT == MVT::i64 && Subtarget.is64Bit() && !isa(N1)) { APInt HiMask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(N1, HiMask) || DAG.MaskedValueIsZero(N0, HiMask)) { SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0); SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS)); } } // Match all-of bool scalar reductions into a bitcast/movmsk + cmp. // TODO: Support multiple SrcOps. 
if (VT == MVT::i1) { SmallVector SrcOps; SmallVector SrcPartials; if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) && SrcOps.size() == 1) { unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) Mask = DAG.getBitcast(MaskVT, SrcOps[0]); if (Mask) { assert(SrcPartials[0].getBitWidth() == NumElts && "Unexpected partial reduction mask"); SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT); Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits); return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ); } } } // InstCombine converts: // `(-x << C0) & C1` // to // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1` // This saves an IR instruction but on x86 the neg/shift version is preferable // so undo the transform. if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) { // TODO: We don't actually need a splat for this, we just need the checks to // hold for each element. 
ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true, /*AllowTruncation*/ false); ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true, /*AllowTruncation*/ false); if (N1C && N01C) { const APInt &MulC = N01C->getAPIntValue(); const APInt &AndC = N1C->getAPIntValue(); APInt MulCLowBit = MulC & (-MulC); if (MulC.uge(AndC) && !MulC.isPowerOf2() && (MulCLowBit + MulC).isPowerOf2()) { SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT); int32_t MulCLowBitLog = MulCLowBit.exactLogBase2(); assert(MulCLowBitLog != -1 && "Isolated lowbit is somehow not a power of 2!"); SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg, DAG.getConstant(MulCLowBitLog, dl, VT)); return DAG.getNode(ISD::AND, dl, VT, Shift, N1); } } } if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget)) return SetCC; if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget)) return V; if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) return R; if (SDValue R = combineBitOpWithShift(N, DAG)) return R; if (SDValue R = combineBitOpWithPACK(N, DAG)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget)) return R; if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue R = combineAndNotIntoANDNP(N, DAG)) return R; if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) // iff c2 is all/no bits mask - i.e. a select-with-zero mask. // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW? 
if (VT.isVector() && getTargetConstantFromNode(N1)) { unsigned Opc0 = N0.getOpcode(); if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) && getTargetConstantFromNode(N0.getOperand(1)) && DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() && N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) { SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1); return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul); } } // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant // avoids slow variable shift (moving shift amount to ECX etc.) if (isOneConstant(N1) && N0->hasOneUse()) { SDValue Src = N0; while ((Src.getOpcode() == ISD::ZERO_EXTEND || Src.getOpcode() == ISD::TRUNCATE) && Src.getOperand(0)->hasOneUse()) Src = Src.getOperand(0); bool ContainsNOT = false; X86::CondCode X86CC = X86::COND_B; // Peek through AND(NOT(SRL(X,Y)),1). if (isBitwiseNot(Src)) { Src = Src.getOperand(0); X86CC = X86::COND_AE; ContainsNOT = true; } if (Src.getOpcode() == ISD::SRL && !isa(Src.getOperand(1))) { SDValue BitNo = Src.getOperand(1); Src = Src.getOperand(0); // Peek through AND(SRL(NOT(X),Y),1). if (isBitwiseNot(Src)) { Src = Src.getOperand(0); X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE; ContainsNOT = true; } // If we have BMI2 then SHRX should be faster for i32/i64 cases. if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32)) if (SDValue BT = getBT(Src, BitNo, dl, DAG)) return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT); } } if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { // Attempt to recursively combine a bitmask AND with shuffles. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; // If either operand is a constant mask, then only the elements that aren't // zero are actually demanded by the other operand. 
auto GetDemandedMasks = [&](SDValue Op) { APInt UndefElts; SmallVector EltBits; int NumElts = VT.getVectorNumElements(); int EltSizeInBits = VT.getScalarSizeInBits(); APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); APInt DemandedElts = APInt::getAllOnes(NumElts); if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) { DemandedBits.clearAllBits(); DemandedElts.clearAllBits(); for (int I = 0; I != NumElts; ++I) { if (UndefElts[I]) { // We can't assume an undef src element gives an undef dst - the // other src might be zero. DemandedBits.setAllBits(); DemandedElts.setBit(I); } else if (!EltBits[I].isZero()) { DemandedBits |= EltBits[I]; DemandedElts.setBit(I); } } } return std::make_pair(DemandedBits, DemandedElts); }; APInt Bits0, Elts0; APInt Bits1, Elts1; std::tie(Bits0, Elts0) = GetDemandedMasks(N1); std::tie(Bits1, Elts1) = GetDemandedMasks(N0); if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) || TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) || TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) || TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); } SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG); SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG); if (NewN0 || NewN1) return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0, NewN1 ? NewN1 : N1); } // Attempt to combine a scalar bitmask AND with an extracted shuffle. if ((VT.getScalarSizeInBits() % 8) == 0 && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(N0.getOperand(1)) && N0->hasOneUse()) { SDValue BitMask = N1; SDValue SrcVec = N0.getOperand(0); EVT SrcVecVT = SrcVec.getValueType(); // Check that the constant bitmask masks whole bytes. 
APInt UndefElts; SmallVector EltBits; if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && llvm::all_of(EltBits, [](const APInt &M) { return M.isZero() || M.isAllOnes(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; unsigned Idx = N0.getConstantOperandVal(1); // Create a root shuffle mask from the byte mask and the extracted index. SmallVector ShuffleMask(NumElts * Scale, SM_SentinelUndef); for (unsigned i = 0; i != Scale; ++i) { if (UndefElts[i]) continue; int VecIdx = Scale * Idx + i; ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx; } if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true, /*AllowVarPerLaneMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle, N0.getOperand(1)); } } if (SDValue R = combineBMILogicOp(N, DAG, Subtarget)) return R; return SDValue(); } // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y)) static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); MVT VT = N->getSimpleValueType(0); unsigned EltSizeInBits = VT.getScalarSizeInBits(); if (!VT.isVector() || (EltSizeInBits % 8) != 0) return SDValue(); SDValue N0 = peekThroughBitcasts(N->getOperand(0)); SDValue N1 = peekThroughBitcasts(N->getOperand(1)); if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) return SDValue(); // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use // VPTERNLOG. Otherwise only do this if either mask has multiple uses already. 
if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) || !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse())) return SDValue(); // Attempt to extract constant byte masks. APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0, /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) return SDValue(); if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1, /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) return SDValue(); for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) { // TODO - add UNDEF elts support. if (UndefElts0[i] || UndefElts1[i]) return SDValue(); if (EltBits0[i] != ~EltBits1[i]) return SDValue(); } SDLoc DL(N); if (useVPTERNLOG(Subtarget, VT)) { // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C. // VPTERNLOG is only available as vXi32/64-bit types. MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64; MVT OpVT = MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits()); SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1)); SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0)); SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0)); SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8); SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm}, DAG, Subtarget); return DAG.getBitcast(VT, Res); } SDValue X = N->getOperand(0); SDValue Y = DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)), DAG.getBitcast(VT, N1.getOperand(0))); return DAG.getNode(ISD::OR, DL, VT, X, Y); } // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { if (N->getOpcode() != ISD::OR) return false; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Canonicalize AND to LHS. if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); // Attempt to match OR(AND(M,Y),ANDNP(M,X)). 
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return false; Mask = N1.getOperand(0); X = N1.getOperand(1); // Check to see if the mask appeared in both the AND and ANDNP. if (N0.getOperand(0) == Mask) Y = N0.getOperand(1); else if (N0.getOperand(1) == Mask) Y = N0.getOperand(0); else return false; // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for // ANDNP combine allows other combines to happen that prevent matching. return true; } // Try to fold: // (or (and (m, y), (pandn m, x))) // into: // (vselect m, x, y) // As a special case, try to fold: // (or (and (m, (sub 0, x)), (pandn m, x))) // into: // (sub (xor X, M), M) static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); EVT VT = N->getValueType(0); if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()))) return SDValue(); SDValue X, Y, Mask; if (!matchLogicBlend(N, X, Y, Mask)) return SDValue(); // Validate that X, Y, and Mask are bitcasts, and see through them. Mask = peekThroughBitcasts(Mask); X = peekThroughBitcasts(X); Y = peekThroughBitcasts(Y); EVT MaskVT = Mask.getValueType(); unsigned EltBits = MaskVT.getScalarSizeInBits(); // TODO: Attempt to handle floating point cases as well? if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); // Attempt to combine to conditional negate: (sub (xor X, M), M) if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL, DAG, Subtarget)) return Res; // PBLENDVB is only available on SSE 4.1. if (!Subtarget.hasSSE41()) return SDValue(); // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops. if (Subtarget.hasVLX()) return SDValue(); MVT BlendVT = VT.is256BitVector() ? 
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } // Helper function for combineOrCmpEqZeroToCtlzSrl // Transforms: // seteq(cmp x, 0) // into: // srl(ctlz x), log2(bitsize(x)) // Input pattern is checked by caller. static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) { SDValue Cmp = Op.getOperand(1); EVT VT = Cmp.getOperand(0).getValueType(); unsigned Log2b = Log2_32(VT.getSizeInBits()); SDLoc dl(Op); SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); // The result of the shift is true or false, and on X86, the 32-bit // encoding of shr and lzcnt is more desirable. SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, DAG.getConstant(Log2b, dl, MVT::i8)); return Scc; } // Try to transform: // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) // into: // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)) // Will also attempt to match more generic cases, eg: // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) // Only applies if the target supports the FastLZCNT feature. static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) return SDValue(); auto isORCandidate = [](SDValue N) { return (N->getOpcode() == ISD::OR && N->hasOneUse()); }; // Check the zero extend is extending to 32-bit or more. The code generated by // srl(ctlz) for 16-bit or less variants of the pattern would require extra // instructions to clear the upper bits. 
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || !isORCandidate(N->getOperand(0))) return SDValue(); // Check the node matches: setcc(eq, cmp 0) auto isSetCCCandidate = [](SDValue N) { return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && N->getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(N->getOperand(1).getOperand(1)) && N->getOperand(1).getValueType().bitsGE(MVT::i32); }; SDNode *OR = N->getOperand(0).getNode(); SDValue LHS = OR->getOperand(0); SDValue RHS = OR->getOperand(1); // Save nodes matching or(or, setcc(eq, cmp 0)). SmallVector ORNodes; while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { ORNodes.push_back(OR); OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); } // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || !isORCandidate(SDValue(OR, 0))) return SDValue(); // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it // to // or(srl(ctlz),srl(ctlz)). // The dag combiner can then fold it into: // srl(or(ctlz, ctlz)). SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG); SDValue Ret, NewRHS; if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG))) Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS); if (!Ret) return SDValue(); // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. while (!ORNodes.empty()) { OR = ORNodes.pop_back_val(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). 
if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG); if (!NewRHS) return SDValue(); Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS); } return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); } static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG) { if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) return SDValue(); SDValue NotOp = And0_L->getOperand(0); if (NotOp == And1_R) std::swap(And1_R, And1_L); if (NotOp != And1_L) return SDValue(); // (~(NotOp) & And0_R) | (NotOp & And1_R) // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R EVT VT = And1_L->getValueType(0); SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R); SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R); SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R); return Xor1; } /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the /// equivalent `((x ^ y) & m) ^ y)` pattern. /// This is typically a better representation for targets without a fused /// "and-not" operation. This function is intended to be called from a /// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes. static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) { // Note that masked-merge variants using XOR or ADD expressions are // normalized to OR by InstCombine so we only check for OR. 
assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); SDValue N0 = Node->getOperand(0); if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) return SDValue(); SDValue N1 = Node->getOperand(1); if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) return SDValue(); SDLoc DL(Node); SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) return Result; if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) return Result; if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) return Result; if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) return Result; return SDValue(); } /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. /// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}. static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly = false) { if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // Look through a one-use zext. if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) Y = Y.getOperand(0); X86::CondCode CC; SDValue EFLAGS; if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) { CC = (X86::CondCode)Y.getConstantOperandVal(0); EFLAGS = Y.getOperand(1); } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) && Y.hasOneUse()) { EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC); } if (!EFLAGS) return SDValue(); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. 
auto *ConstantX = dyn_cast(X); if (ConstantX && !ZeroSecondOpOnly) { if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) || (IsSub && CC == X86::COND_B && ConstantX->isZero())) { // This is a complicated way to get -1 or 0 from the carry flag: // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), EFLAGS); } if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) || (IsSub && CC == X86::COND_A && ConstantX->isZero())) { if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { // Swap the operands of a SUB, and we have the same pattern as above. // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB SDValue NewSub = DAG.getNode( X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), NewEFLAGS); } } } if (CC == X86::COND_B) { // X + SETB Z --> adc X, 0 // X - SETB Z --> sbb X, 0 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), EFLAGS); } if (ZeroSecondOpOnly) return SDValue(); if (CC == X86::COND_A) { // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. 
// if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), NewEFLAGS); } } if (CC == X86::COND_AE) { // X + SETAE --> sbb X, -1 // X - SETAE --> adc X, -1 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(-1, DL, VT), EFLAGS); } if (CC == X86::COND_BE) { // X + SETBE --> sbb X, -1 // X - SETBE --> adc X, -1 // Try to convert COND_BE into COND_AE in an attempt to facilitate // materializing "setae reg". // // Do not flip "e <= c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(-1, DL, VT), NewEFLAGS); } } if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() || !X86::isZeroNode(EFLAGS.getOperand(1)) || !EFLAGS.getOperand(0).getValueType().isInteger()) return SDValue(); SDValue Z = EFLAGS.getOperand(0); EVT ZVT = Z.getValueType(); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. 
if (ConstantX) { // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with // fake operands: // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) || (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) { SDValue Zero = DAG.getConstant(0, DL, ZVT); SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' // with fake operands: // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) || (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) { SDValue One = DAG.getConstant(1, DL, ZVT); SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1.getValue(1)); } } // (cmp Z, 1) sets the carry flag if Z is 0. SDValue One = DAG.getConstant(1, DL, ZVT); SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); // Add the flags type for ADC/SBB nodes. SDVTList VTs = DAG.getVTList(VT, MVT::i32); // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? 
X86ISD::SBB : X86ISD::ADC, DL, VTs, X, DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); } /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) { bool IsSub = N->getOpcode() == ISD::SUB; SDValue X = N->getOperand(0); SDValue Y = N->getOperand(1); EVT VT = N->getValueType(0); if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG)) return ADCOrSBB; // Commute and try again (negate the result for subtracts). if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) { if (IsSub) ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT); return ADCOrSBB; } return SDValue(); } static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) && "Unexpected opcode"); // Delegate to combineAddOrSubToADCOrSBB if we have: // // (xor/or (zero_extend (setcc)) imm) // // where imm is odd if and only if we have xor, in which case the XOR/OR are // equivalent to a SUB/ADD, respectively. if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) { if (auto *N1C = dyn_cast(N1)) { bool IsSub = N->getOpcode() == ISD::XOR; bool N1COdd = N1C->getZExtValue() & 1; if (IsSub ? 
N1COdd : !N1COdd) { SDLoc DL(N); EVT VT = N->getValueType(0); if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG)) return R; } } } // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2) if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ && N0.getOperand(0).getOpcode() == ISD::AND && ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && ISD::isBuildVectorAllOnes(N1.getNode())) { MVT VT = N->getSimpleValueType(0); APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits)) { bool IsPow2OrUndef = true; for (unsigned I = 0, E = EltBits.size(); I != E; ++I) IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2(); if (IsPow2OrUndef) return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(0).getOperand(1)); } } return SDValue(); } static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is SSE1 only convert to FOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, DAG.getNode(X86ISD::FOR, dl, MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. // TODO: Support multiple SrcOps. 
if (VT == MVT::i1) { SmallVector SrcOps; SmallVector SrcPartials; if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) && SrcOps.size() == 1) { unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) Mask = DAG.getBitcast(MaskVT, SrcOps[0]); if (Mask) { assert(SrcPartials[0].getBitWidth() == NumElts && "Unexpected partial reduction mask"); SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT); SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT); Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits); return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE); } } } if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget)) return SetCC; if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) return R; if (SDValue R = combineBitOpWithShift(N, DAG)) return R; if (SDValue R = combineBitOpWithPACK(N, DAG)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) return R; if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it. 
if ((VT == MVT::i32 || VT == MVT::i64) && N0.getOpcode() == ISD::SUB && N0.hasOneUse() && isNullConstant(N0.getOperand(0))) { SDValue Cond = N0.getOperand(1); if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse()) Cond = Cond.getOperand(0); if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) { if (auto *CN = dyn_cast(N1)) { uint64_t Val = CN->getZExtValue(); if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) { X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG); SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT); R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT)); R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT)); return R; } } } } // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y). // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X). // iff the upper elements of the non-shifted arg are zero. // KUNPCK require 16+ bool vector elements. 
if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) { unsigned NumElts = VT.getVectorNumElements(); unsigned HalfElts = NumElts / 2; APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts); if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && N1.getConstantOperandAPInt(1) == HalfElts && DAG.MaskedVectorIsZero(N0, UpperElts)) { return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N0, 0, DAG, dl, HalfElts), extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts)); } if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL && N0.getConstantOperandAPInt(1) == HalfElts && DAG.MaskedVectorIsZero(N1, UpperElts)) { return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N1, 0, DAG, dl, HalfElts), extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts)); } } if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { // Attempt to recursively combine an OR of shuffles. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; // If either operand is a constant mask, then only the elements that aren't // allones are actually demanded by the other operand. auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { APInt UndefElts; SmallVector EltBits; int NumElts = VT.getVectorNumElements(); int EltSizeInBits = VT.getScalarSizeInBits(); if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) return false; APInt DemandedElts = APInt::getZero(NumElts); for (int I = 0; I != NumElts; ++I) if (!EltBits[I].isAllOnes()) DemandedElts.setBit(I); return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI); }; if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); } } // We should fold "masked merge" patterns when `andn` is not available. 
if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) if (SDValue R = foldMaskedMerge(N, DAG)) return R; if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG)) return R; return SDValue(); } /// Try to turn tests against the signbit in the form of: /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) /// into: /// SETGT(X, -1) static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // This is only worth doing if the output type is i8 or i1. EVT ResultType = N->getValueType(0); if (ResultType != MVT::i8 && ResultType != MVT::i1) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // We should be performing an xor against a truncated shift. if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) return SDValue(); // Make sure we are performing an xor against one. if (!isOneConstant(N1)) return SDValue(); // SetCC on x86 zero extends so only act on this if it's a logical shift. SDValue Shift = N0.getOperand(0); if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) return SDValue(); // Make sure we are truncating from one of i16, i32 or i64. EVT ShiftTy = Shift.getValueType(); if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) return SDValue(); // Make sure the shift amount extracts the sign bit. if (!isa(Shift.getOperand(1)) || Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. // N.B. Using SETGE against 0 works but we want a canonical looking // comparison, using SETGT matches up with what TranslateX86CC. 
SDLoc DL(N); SDValue ShiftOp = Shift.getOperand(0); EVT ShiftOpTy = ShiftOp.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType); SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); if (SetCCResultType != ResultType) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); return Cond; } /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: /// pcmpgt X, -1 /// /// This should be called before type legalization because the pattern may not /// persist after that. static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (!VT.isSimple()) return SDValue(); switch (VT.getSimpleVT().SimpleTy) { // clang-format off default: return SDValue(); case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; // clang-format on } // There must be a shift right algebraic before the xor, and the xor must be a // 'not' operation. SDValue Shift = N->getOperand(0); SDValue Ones = N->getOperand(1); if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || !ISD::isBuildVectorAllOnes(Ones.getNode())) return SDValue(); // The shift should be smearing the sign bit across each vector element. auto *ShiftAmt = isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true); if (!ShiftAmt || ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1)) return SDValue(); // Create a greater-than comparison against -1. We don't use the more obvious // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. 
return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT); } /// Detect patterns of truncation with unsigned saturation: /// /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// Return the source value x to be truncated or SDValue() if the pattern was /// not matched. /// /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type), /// where C1 >= 0 and C2 is unsigned max of destination type. /// /// (truncate (smax (smin (x, C2), C1)) to dest_type) /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2. /// /// These two patterns are equivalent to: /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type) /// So return the smax(x, C1) value to be truncated or SDValue() if the /// pattern was not matched. static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL) { EVT InVT = In.getValueType(); // Saturation with truncation. We truncate from InVT to VT. assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && "Unexpected types for truncate operation"); // Match min/max and return limit value as a parameter. auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue { if (V.getOpcode() == Opcode && ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit)) return V.getOperand(0); return SDValue(); }; APInt C1, C2; if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2)) // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according // the element size of the destination type. 
if (C2.isMask(VT.getScalarSizeInBits())) return UMin; if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2)) if (MatchMinMax(SMin, ISD::SMAX, C1)) if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits())) return SMin; if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1)) if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2)) if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1)) { return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1)); } return SDValue(); } /// Detect patterns of truncation with signed saturation: /// (truncate (smin ((smax (x, signed_min_of_dest_type)), /// signed_max_of_dest_type)) to dest_type) /// or: /// (truncate (smax ((smin (x, signed_max_of_dest_type)), /// signed_min_of_dest_type)) to dest_type). /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type]. /// Return the source value to be truncated or SDValue() if the pattern was not /// matched. static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) { unsigned NumDstBits = VT.getScalarSizeInBits(); unsigned NumSrcBits = In.getScalarValueSizeInBits(); assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation"); auto MatchMinMax = [](SDValue V, unsigned Opcode, const APInt &Limit) -> SDValue { APInt C; if (V.getOpcode() == Opcode && ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit) return V.getOperand(0); return SDValue(); }; APInt SignedMax, SignedMin; if (MatchPackUS) { SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits); SignedMin = APInt(NumSrcBits, 0); } else { SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits); SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits); } if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax)) if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin)) return SMax; if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin)) if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax)) return SMin; return SDValue(); } 
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2() || !VT.isVector()) return SDValue(); EVT SVT = VT.getVectorElementType(); EVT InVT = In.getValueType(); EVT InSVT = InVT.getVectorElementType(); // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is // split across two registers. We can use a packusdw+perm to clamp to 0-65535 // and concatenate at the same time. Then we can use a final vpmovuswb to // clip to 0-255. if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && InVT == MVT::v16i32 && VT == MVT::v16i8) { if (SDValue USatVal = detectSSatPattern(In, VT, true)) { // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB. SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal, DL, DAG, Subtarget); assert(Mid && "Failed to pack!"); return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid); } } // vXi32 truncate instructions are available with AVX512F. // vXi16 truncate instructions are only available with AVX512BW. // For 256-bit or smaller vectors, we require VLX. // FIXME: We could widen truncates to 512 to remove the VLX restriction. // If the result type is 256-bits or larger and we have disable 512-bit // registers, we should go ahead and use the pack instructions if possible. bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) || (Subtarget.hasBWI() && InSVT == MVT::i16)) && (InVT.getSizeInBits() > 128) && (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) && !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256); if (!PreferAVX512 && VT.getVectorNumElements() > 1 && isPowerOf2_32(VT.getVectorNumElements()) && (SVT == MVT::i8 || SVT == MVT::i16) && (InSVT == MVT::i16 || InSVT == MVT::i32)) { if (SDValue USatVal = detectSSatPattern(In, VT, true)) { // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW). 
if (SVT == MVT::i8 && InSVT == MVT::i32) { EVT MidVT = VT.changeVectorElementType(MVT::i16); SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, DAG, Subtarget); assert(Mid && "Failed to pack!"); SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, Subtarget); assert(V && "Failed to pack!"); return V; } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); } if (SDValue SSatVal = detectSSatPattern(In, VT)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 && Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) && (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) { unsigned TruncOpc = 0; SDValue SatVal; if (SDValue SSatVal = detectSSatPattern(In, VT)) { SatVal = SSatVal; TruncOpc = X86ISD::VTRUNCS; } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) { SatVal = USatVal; TruncOpc = X86ISD::VTRUNCUS; } if (SatVal) { unsigned ResElts = VT.getVectorNumElements(); // If the input type is less than 512 bits and we don't have VLX, we need // to widen to 512 bits. if (!Subtarget.hasVLX() && !InVT.is512BitVector()) { unsigned NumConcats = 512 / InVT.getSizeInBits(); ResElts *= NumConcats; SmallVector ConcatOps(NumConcats, DAG.getUNDEF(InVT)); ConcatOps[0] = SatVal; InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumConcats * InVT.getVectorNumElements()); SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps); } // Widen the result if its narrower than 128 bits. 
if (ResElts * SVT.getSizeInBits() < 128) ResElts = 128 / SVT.getSizeInBits(); EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts); SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } } return SDValue(); } static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { auto *Ld = cast(N); EVT RegVT = Ld->getValueType(0); SDValue Ptr = Ld->getBasePtr(); SDValue Chain = Ld->getChain(); ISD::LoadExtType Ext = Ld->getExtensionType(); if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple()) return SDValue(); if (!(RegVT.is128BitVector() || RegVT.is256BitVector())) return SDValue(); const Constant *LdC = getTargetConstantFromBasePtr(Ptr); if (!LdC) return SDValue(); auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs, ArrayRef Bits, ArrayRef UserBits) { for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) { if (Undefs[I]) continue; if (UserUndefs[I] || Bits[I] != UserBits[I]) return false; } return true; }; // Look through all other loads/broadcasts in the chain for another constant // pool entry. for (SDNode *User : Chain->uses()) { auto *UserLd = dyn_cast(User); if (User != N && UserLd && (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || User->getOpcode() == X86ISD::VBROADCAST_LOAD || ISD::isNormalLoad(User)) && UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) && User->getValueSizeInBits(0).getFixedValue() > RegVT.getFixedSizeInBits()) { EVT UserVT = User->getValueType(0); SDValue UserPtr = UserLd->getBasePtr(); const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); // See if we are loading a constant that matches in the lower // bits of a longer constant (but from a different constant pool ptr). 
if (UserC && UserPtr != Ptr) { unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits(); unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits(); if (LdSize < UserSize || !ISD::isNormalLoad(User)) { APInt Undefs, UserUndefs; SmallVector Bits, UserBits; unsigned NumBits = std::min(RegVT.getScalarSizeInBits(), UserVT.getScalarSizeInBits()); if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs, Bits) && getTargetConstantBitsFromNode(SDValue(User, 0), NumBits, UserUndefs, UserBits)) { if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) { SDValue Extract = extractSubVector( SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits()); Extract = DAG.getBitcast(RegVT, Extract); return DCI.CombineTo(N, Extract, SDValue(User, 1)); } } } } } } return SDValue(); } static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { auto *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. Also split non-temporal aligned loads on // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. 
ISD::LoadExtType Ext = Ld->getExtensionType(); unsigned Fast; if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Ld->getAlign() >= Align(16)) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, *Ld->getMemOperand(), &Fast) && !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); unsigned HalfOffset = 16; SDValue Ptr1 = Ld->getBasePtr(); SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems / 2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, Ld->getPointerInfo().getWithOffset(HalfOffset), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2); return DCI.CombineTo(N, NewVec, TF, true); } // Bool vector load - attempt to cast to an integer, as we have good // (vXiY *ext(vXi1 bitcast(iX))) handling. if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() && RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) { unsigned NumElts = RegVT.getVectorNumElements(); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); if (TLI.isTypeLegal(IntVT)) { SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); } } // If we also broadcast this vector to a wider type, then just extract the // lowest subvector. 
// (Only simple, non-extending 128/256-bit vector loads on AVX targets are
// considered for this fold.)
  if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
      (RegVT.is128BitVector() || RegVT.is256BitVector())) {
    SDValue Ptr = Ld->getBasePtr();
    SDValue Chain = Ld->getChain();
    for (SDNode *User : Chain->uses()) {
      auto *UserLd = dyn_cast(User);
      if (User != N && UserLd &&
          User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
          UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
          UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
          !User->hasAnyUseOfValue(1) &&
          User->getValueSizeInBits(0).getFixedValue() >
              RegVT.getFixedSizeInBits()) {
        SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
                                           RegVT.getSizeInBits());
        Extract = DAG.getBitcast(RegVT, Extract);
        return DCI.CombineTo(N, Extract, SDValue(User, 1));
      }
    }
  }

  if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
    return V;

  // Cast ptr32 and ptr64 pointers to the default address space before a load.
  unsigned AddrSpace = Ld->getAddressSpace();
  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
      AddrSpace == X86AS::PTR32_UPTR) {
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
      SDValue Cast =
          DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
      return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
                            Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
                            Ld->getMemOperand()->getFlags());
    }
  }

  return SDValue();
}

/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
///
/// Undef operands are skipped (they count as neither true nor false). Any
/// operand that is neither undef nor a constant makes the function return -1.
/// A constant operand counts as "true" when its value has at least one
/// trailing one bit.
///
/// \param V  Candidate mask value; must have i1 vector elements to match.
/// \returns  Index of the single true operand, or -1 if V does not match, has
///           no true operand, or has more than one.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  // NOTE(review): the target type of this dyn_cast (and of the one in the loop
  // below) is not spelled out in this copy of the file - confirm against the
  // upstream source.
  auto *BV = dyn_cast(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  // -1 means "no true element seen yet".
  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().countr_one() >= 1) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}

/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
///
/// \param MaskedOp        Masked load/store whose mask, base pointer, memory
///                        type and original alignment are inspected.
/// \param DAG             Used to build the offset address and index constant.
/// \param[out] Addr       Base pointer, advanced by \p Offset bytes when the
///                        true element is not element 0.
/// \param[out] Index      Pointer-sized constant holding the true element's
///                        index.
/// \param[out] Alignment  Common alignment of the operation's original
///                        alignment and the element store size.
/// \param[out] Offset     Byte offset of the true element from the base
///                        pointer (element index * element store size).
/// \returns false, leaving all outputs untouched, when the mask does not have
///          exactly one true element as determined by getOneTrueElt().
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, Align &Alignment,
                                         unsigned &Offset) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Offset = 0;
  Addr = MaskedOp->getBasePtr();
  // Element 0 lives at the base pointer itself, so no address arithmetic is
  // emitted for it.
  if (TrueMaskElt != 0) {
    Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
                                    SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
                              EltVT.getStoreSize());
  return true;
}

/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
///
/// \param ML         Unindexed masked load to rewrite (asserted).
/// \param DAG        DAG used to create the replacement nodes.
/// \param DCI        Combiner state; the replacement is installed through
///                   CombineTo().
/// \param Subtarget  Queried for 64-bit mode to pick the scalar load type.
/// \returns the CombineTo() result, or an empty SDValue when the mask does not
///          have exactly one constant-true element.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  Align Alignment;
  unsigned Offset;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();

  // For i64 elements on a non-64-bit target, load the element as f64 and do
  // the insert on an f64-element view of the vector instead.
  EVT CastVT = VT;
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    CastVT = VT.changeVectorElementType(EltVT);
  }

  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
                  ML->getPointerInfo().getWithOffset(Offset),
                  Alignment, ML->getMemOperand()->getFlags());

  SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert =
      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
  Insert = DAG.getBitcast(VT, Insert);
  // Replace both results of the masked load: the vector value and the chain
  // (taken from the new scalar load).
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}

/// Simplify an unindexed masked load whose mask is a build vector of
/// constants.
///
/// Two rewrites are attempted, in order:
///  1. If neither the first nor the last mask operand is a null constant, the
///     masked load becomes a full vector load followed by a select against
///     the pass-through value.
///  2. Otherwise, unless the pass-through is undef or an all-zeros build
///     vector, the masked load is re-emitted with an undef pass-through and a
///     select re-applies the original pass-through.
///
/// \returns the CombineTo() result, or an empty SDValue when the mask is not a
///          constant build vector or neither rewrite applies.
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  // NOTE(review): the target type of this cast is not spelled out in this copy
  // of the file - confirm against the upstream source.
  BuildVectorSDNode *MaskBV = cast(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
                                  ML->getPassThru());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).

  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getPassThru().isUndef())
    return SDValue();

  // An all-zeros pass-through is also left as is.
  if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(
      VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
      DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
      ML->getAddressingMode(), ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
                                ML->getPassThru());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}

/// DAG combine for masked loads.
///
/// Expanding loads are left untouched. For non-extending loads the
/// single-true-element reduction is tried first, then (only without AVX512)
/// the constant-mask rewrite. Finally, when the mask elements are wider than
/// one bit, the mask is simplified knowing that only the sign bit of each
/// lane is demanded.
///
/// \returns a replacement value, N itself when the mask was simplified in
///          place, or an empty SDValue when nothing changed.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  auto *Mld = cast(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad =
            reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
      return ScalarLoad;

    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  // If the mask value has been legalized to a non-boolean vector, try to
  // simplify ops leading up to it. We only demand the MSB of each lane.
  SDValue Mask = Mld->getMask();
  if (Mask.getScalarValueSizeInBits() != 1) {
    EVT VT = Mld->getValueType(0);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
      // Re-queue N unless it was deleted during the simplification.
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
    // Otherwise try a replacement mask and rebuild the load around it, keeping
    // every other operand and attribute of the original node.
    if (SDValue NewMask =
            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
      return DAG.getMaskedLoad(
          VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
          NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
          Mld->getAddressingMode(), Mld->getExtensionType());
  }

  return SDValue();
}

/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
///
/// \returns the new scalar store, or an empty SDValue when the mask does not
///          have exactly one constant-true element.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG,
                                              const X86Subtarget &Subtarget) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  Align Alignment;
  unsigned Offset;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  SDValue Value = MS->getValue();
  EVT VT = Value.getValueType();
  EVT EltVT = VT.getVectorElementType();
  // For i64 elements on a non-64-bit target, extract the element as f64 from
  // an f64-element view of the stored vector.
  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
    EltVT = MVT::f64;
    EVT CastVT = VT.changeVectorElementType(EltVT);
    Value = DAG.getBitcast(CastVT, Value);
  }
  SDValue Extract =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
                      MS->getPointerInfo().getWithOffset(Offset),
                      Alignment, MS->getMemOperand()->getFlags());
}

/// DAG combine for masked stores.
///
/// Compressing and truncating masked stores are left untouched. Otherwise the
/// single-true-element reduction is tried first; then, when the mask elements
/// are wider than one bit, the mask is simplified knowing only the sign bit
/// of each lane is demanded; finally a single-use TRUNCATE feeding the stored
/// value is folded into the masked store when the target reports the
/// truncating store as legal.
///
/// \returns a replacement node, N itself when the mask was simplified in
///          place, or an empty SDValue when nothing changed.
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast(N);
  if (Mst->isCompressingStore())
    return SDValue();

  EVT VT = Mst->getValue().getValueType();
  // NOTE(review): dl is not referenced below; the node builders in this
  // function use SDLoc(N) instead.
  SDLoc dl(Mst);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (Mst->isTruncatingStore())
    return SDValue();

  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
    return ScalarStore;

  // If the mask value has been legalized to a non-boolean vector, try to
  // simplify ops leading up to it. We only demand the MSB of each lane.
  SDValue Mask = Mst->getMask();
  if (Mask.getScalarValueSizeInBits() != 1) {
    APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
      // Re-queue N unless it was deleted during the simplification.
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
    if (SDValue NewMask =
            TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
      return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
                                Mst->getBasePtr(), Mst->getOffset(), NewMask,
                                Mst->getMemoryVT(), Mst->getMemOperand(),
                                Mst->getAddressingMode());
  }

  // Fold store(trunc(X)) into a masked store of X itself.
  // NOTE(review): the trailing 'true' is presumably the is-truncating flag of
  // getMaskedStore - confirm against the SelectionDAG API.
  SDValue Value = Mst->getValue();
  if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                            Mst->getMemoryVT())) {
    return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
                              Mst->getBasePtr(), Mst->getOffset(), Mask,
                              Mst->getMemoryVT(), Mst->getMemOperand(),
                              Mst->getAddressingMode(), true);
  }

  return SDValue();
}

/// DAG combine for ordinary stores.
///
/// The rewrites below are tried in source order and the first one that
/// applies returns its replacement store (or an empty SDValue to give up):
///  - vXi1 stores: bitcast to an integer (no AVX512), scalarize a v1i1
///    scalar_to_vector, widen narrow masks to v8i1, or store constant masks
///    as integers.
///  - split 256-bit stores that the target reports as slow, and under-aligned
///    non-temporal vector stores.
///  - fold TRUNCATE / VTRUNC* / extract_element(VTRUNC) producers into
///    truncating stores, and match saturation patterns on truncating stores.
///  - cast ptr32/ptr64 address spaces to the default address space.
///  - on 32-bit targets with usable SSE2 f64, turn 64-bit integer
///    load->store and extract->store pairs into f64 operations.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // NOTE(review): the target type of this cast (and of the isa/cast calls near
  // the end of the function) is not spelled out in this copy of the file -
  // confirm against the upstream source.
  StoreSDNode *St = cast(N);
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getValue();
  EVT VT = StoredVal.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Convert a store of vXi1 into a store of iX and a bitcast.
  if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
      VT.getVectorElementType() == MVT::i1) {

    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
    StoredVal = DAG.getBitcast(NewVT, StoredVal);

    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
  // This will avoid a copy to k-register.
  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      StoredVal.getOperand(0).getValueType() == MVT::i8) {
    SDValue Val = StoredVal.getOperand(0);
    // We must store zeros to the unused bits.
    Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
    return DAG.getStore(St->getChain(), dl, Val,
                        St->getBasePtr(), St->getPointerInfo(),
                        St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Widen v1i1/v2i1/v4i1 stores to v8i1.
  if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
      Subtarget.hasAVX512()) {
    unsigned NumConcats = 8 / VT.getVectorNumElements();
    // We must store zeros to the unused bits.
    // NOTE(review): element type/inline size of this SmallVector are not
    // spelled out in this copy of the file.
    SmallVector Ops(NumConcats, DAG.getConstant(0, dl, VT));
    Ops[0] = StoredVal;
    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // Turn vXi1 stores of constants into a scalar store.
  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If its a v64i1 store without 64-bit support, we need two stores.
    if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(0, 32));
      Lo = combinevXi1ConstantToInteger(Lo, DAG);
      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(32, 32));
      Hi = combinevXi1ConstantToInteger(Hi, DAG);

      // The high half is stored 4 bytes past the base pointer.
      SDValue Ptr0 = St->getBasePtr();
      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);

      SDValue Ch0 =
          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
                       St->getOriginalAlign(),
                       St->getMemOperand()->getFlags());
      SDValue Ch1 =
          DAG.getStore(St->getChain(), dl, Hi, Ptr1,
                       St->getPointerInfo().getWithOffset(4),
                       St->getOriginalAlign(),
                       St->getMemOperand()->getFlags());
      // Both stores hang off the original chain; join their output chains.
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }

    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
  // Sandy Bridge, perform two 16-byte stores.
  unsigned Fast;
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             *St->getMemOperand(), &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    // A single-element vector cannot be split in half.
    if (NumElems < 2)
      return SDValue();

    return splitVectorStore(St, DAG);
  }

  // Split under-aligned vector non-temporal stores.
  if (St->isNonTemporal() && StVT == VT &&
      St->getAlign().value() < VT.getStoreSize()) {
    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
    // vectors or the legalizer can scalarize it to use MOVNTI.
    if (VT.is256BitVector() || VT.is512BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
      if (NumElems < 2)
        return SDValue();
      return splitVectorStore(St, DAG);
    }

    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
    // to use MOVNTI.
    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
      MVT NTVT = Subtarget.hasSSE4A()
                     ? MVT::v2f64
                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
      return scalarizeVectorStore(St, NTVT, DAG);
    }
  }

  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
  // supported, but avx512f is by extending to v16i32 and truncating.
  if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
      St->getValue().getOpcode() == ISD::TRUNCATE &&
      St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
      TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
      St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
                              St->getValue().getOperand(0));
    return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
                             MVT::v16i8, St->getMemOperand());
  }

  // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
  if (!St->isTruncatingStore() &&
      (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
       StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
      StoredVal.hasOneUse() &&
      TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
    bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
    return EmitTruncSStore(IsSigned, St->getChain(),
                           dl, StoredVal.getOperand(0), St->getBasePtr(),
                           VT, St->getMemOperand(), DAG);
  }

  // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
  if (!St->isTruncatingStore()) {
    // Looks through an optional single-use TRUNCATE and returns the vector
    // operand of a single-use element-0 extract whose vector operand is itself
    // single-use; returns an empty SDValue otherwise.
    auto IsExtractedElement = [](SDValue V) {
      if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
        V = V.getOperand(0);
      unsigned Opc = V.getOpcode();
      if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
          isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
          V.getOperand(0).hasOneUse())
        return V.getOperand(0);
      return SDValue();
    };
    if (SDValue Extract = IsExtractedElement(StoredVal)) {
      SDValue Trunc = peekThroughOneUseBitcasts(Extract);
      if (Trunc.getOpcode() == X86ISD::VTRUNC) {
        SDValue Src = Trunc.getOperand(0);
        MVT DstVT = Trunc.getSimpleValueType();
        MVT SrcVT = Src.getSimpleValueType();
        unsigned NumSrcElts = SrcVT.getVectorNumElements();
        unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
        MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
        // Only fold when the truncated source elements exactly fill the
        // stored value.
        if (NumTruncBits == VT.getSizeInBits() &&
            TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
          return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
                                   TruncVT, St->getMemOperand());
        }
      }
    }
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    if (TLI.isTruncStoreLegal(VT, StVT)) {
      if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
        return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                               dl, Val, St->getBasePtr(),
                               St->getMemoryVT(), St->getMemOperand(), DAG);
      if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
                                          DAG, dl))
        return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                               dl, Val, St->getBasePtr(),
                               St->getMemoryVT(), St->getMemOperand(), DAG);
    }

    // No other combine below is attempted for truncating vector stores.
    return SDValue();
  }

  // Cast ptr32 and ptr64 pointers to the default address space before a store.
  unsigned AddrSpace = St->getAddressSpace();
  if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
      AddrSpace == X86AS::PTR32_UPTR) {
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    if (PtrVT != St->getBasePtr().getSimpleValueType()) {
      SDValue Cast =
          DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
      return DAG.getTruncStore(
          St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
          St->getOriginalAlign(), St->getMemOperand()->getFlags(),
          St->getAAInfo());
    }
  }

  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();

  // The remaining rewrites only apply on non-64-bit targets where f64 is
  // usable.
  if (!F64IsLegal || Subtarget.is64Bit())
    return SDValue();

  if (VT == MVT::i64 && isa(St->getValue()) &&
      cast(St->getValue())->isSimple() &&
      St->getChain().hasOneUse() && St->isSimple()) {
    auto *Ld = cast(St->getValue());

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // Avoid the transformation if there are multiple uses of the loaded value.
    if (!Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // Lower to a single movq load/store pair.
    SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
                                Ld->getBasePtr(), Ld->getMemOperand());

    // Make sure new load is placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
    return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                        St->getMemOperand());
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getOriginalAlign(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}

/// DAG combine for the X86 vector-extract store node: simplify the stored
/// vector knowing that only its lowest elements (as many as fit in the memory
/// type) are demanded.
///
/// \returns N itself when the stored value was simplified in place, otherwise
///          an empty SDValue.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  // NOTE(review): the target type of this cast is not spelled out in this copy
  // of the file; the node is only used for getMemoryVT() here.
  auto *St = cast(N);

  SDValue StoredVal = N->getOperand(1);
  MVT VT = StoredVal.getSimpleValueType();
  EVT MemVT = St->getMemoryVT();

  // Figure out which elements we demand.
  unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
  APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
    // Re-queue N unless it was deleted during the simplification.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  return SDValue();
}

/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS.  A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector.  For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
///
/// \param HOpcode          Horizontal opcode being formed; used to detect
///                         existing horizontal-op users of the sources.
/// \param[in,out] LHS,RHS  Operands of the binop; overwritten only on success.
/// \param IsCommutative    Allows the odd/even element pairing to appear with
///                         LHS and RHS swapped.
/// \param[out] PostShuffleMask  On success, the shuffle to apply to the
///                         horizontal-op result; left empty when the result is
///                         already in order. Contents are unspecified when the
///                         function returns false.
/// \param ForceHorizOp     When true, the shouldUseHorizontalOp() check is
///                         skipped.
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
                              SelectionDAG &DAG, const X86Subtarget &Subtarget,
                              bool IsCommutative,
                              SmallVectorImpl &PostShuffleMask,
                              bool ForceHorizOp) {
  // If either operand is undef, bail out. The binop should be simplified.
  if (LHS.isUndef() || RHS.isUndef())
    return false;

  // Look for the following pattern:
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  MVT VT = LHS.getSimpleValueType();
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");
  unsigned NumElts = VT.getVectorNumElements();

  // Decodes Op as a shuffle of up to two sources. On a match, N0/N1 receive
  // the sources and ShuffleMask the mask scaled to NumElts elements; on no
  // match all three are left untouched. When Op is the low subvector extract
  // of a 256-bit value, the wide single-source shuffle is decoded instead and
  // its source is split into N0/N1.
  // NOTE(review): the element types/inline sizes of the SmallVector,
  // SmallVectorImpl and ArrayRef uses in this function are not spelled out in
  // this copy of the file - confirm against the upstream source.
  auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
                        SmallVectorImpl &ShuffleMask) {
    bool UseSubVector = false;
    if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Op.getOperand(0).getValueType().is256BitVector() &&
        llvm::isNullConstant(Op.getOperand(1))) {
      Op = Op.getOperand(0);
      UseSubVector = true;
    }
    SmallVector SrcOps;
    SmallVector SrcMask, ScaledMask;
    SDValue BC = peekThroughBitcasts(Op);
    if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
        !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
          return Op.getValueSizeInBits() == BC.getValueSizeInBits();
        })) {
      resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
      if (!UseSubVector && SrcOps.size() <= 2 &&
          scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
        N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
        N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
        ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
      }
      if (UseSubVector && SrcOps.size() == 1 &&
          scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
        std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
        // Only the low half of the wide mask describes the extracted
        // subvector.
        ArrayRef Mask = ArrayRef(ScaledMask).slice(0, NumElts);
        ShuffleMask.assign(Mask.begin(), Mask.end());
      }
    }
  };

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle, then pretend it is the identity shuffle:
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: A default initialized SDValue represents an UNDEF of type VT.
  SDValue A, B;
  SmallVector LMask;
  GetShuffle(LHS, A, B, LMask);

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector RMask;
  GetShuffle(RHS, C, D, RMask);

  // At least one of the operands should be a vector shuffle.
  unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
  if (NumShuffles == 0)
    return false;

  // Treat a non-shuffle operand as an identity shuffle of itself.
  if (LMask.empty()) {
    A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask.push_back(i);
  }

  if (RMask.empty()) {
    C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask.push_back(i);
  }

  // If we have an unary mask, ensure the other op is set to null.
  if (isUndefOrInRange(LMask, 0, NumElts))
    B = SDValue();
  else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
    A = SDValue();

  if (isUndefOrInRange(RMask, 0, NumElts))
    D = SDValue();
  else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
    C = SDValue();

  // If A and B occur in reverse order in RHS, then canonicalize by commuting
  // RHS operands and shuffle mask.
  if (A != C) {
    std::swap(C, D);
    ShuffleVectorSDNode::commuteMask(RMask);
  }
  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D))
    return false;

  // Start from an all-undef post-shuffle mask; entries are filled in below.
  PostShuffleMask.clear();
  PostShuffleMask.append(NumElts, SM_SentinelUndef);

  // LHS and RHS are now:
  //   LHS = shuffle A, B, LMask
  //   RHS = shuffle A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
  // so we just repeat the inner loop if this is a 256-bit op.
  unsigned Num128BitChunks = VT.getSizeInBits() / 128;
  unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
  assert((NumEltsPer128BitChunk % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
    for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
      // Ignore undefined components.
      int LIdx = LMask[i + j], RIdx = RMask[i + j];
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive odd/even elements are being operated on. If not,
      // this is not a horizontal operation.
      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
        return false;

      // Compute the post-shuffle mask index based on where the element
      // is stored in the HOP result, and where it needs to be moved to.
      int Base = LIdx & ~1u;
      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));

      // The  low half of the 128-bit result must choose from A.
      // The high half of the 128-bit result must choose from B,
      // unless B is undef. In that case, we are always choosing from A.
      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
        Index += NumEltsPer64BitChunk;
      PostShuffleMask[i + j] = Index;
    }
  }

  SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

  // An identity (or undef) post-shuffle is reported to the caller as an empty
  // mask.
  bool IsIdentityPostShuffle =
      isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
  if (IsIdentityPostShuffle)
    PostShuffleMask.clear();

  // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
  if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
      isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
    return false;

  // If the source nodes are already used in HorizOps then always accept this.
  // Shuffle folding should merge these back together.
  auto FoundHorizUser = [&](SDNode *User) {
    return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
  };
  ForceHorizOp =
      ForceHorizOp || (llvm::any_of(NewLHS->uses(), FoundHorizUser) &&
                       llvm::any_of(NewRHS->uses(), FoundHorizUser));

  // Assume a SingleSource HOP if we only shuffle one input and don't need to
  // shuffle the result.
  if (!ForceHorizOp &&
      !shouldUseHorizontalOp(NewLHS == NewRHS &&
                                 (NumShuffles < 2 || !IsIdentityPostShuffle),
                             DAG, Subtarget))
    return false;

  LHS = DAG.getBitcast(VT, NewLHS);
  RHS = DAG.getBitcast(VT, NewRHS);
  return true;
}

// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
// Handles FADD/FSUB (v4f32/v2f64 with SSE3, v8f32/v4f64 with AVX) and ADD/SUB
// (v8i16/v4i32/v16i16/v8i32 with SSSE3). Returns the horizontal op, followed by
// a result shuffle when isHorizontalBinOp() requests one, or an empty SDValue
// when the node is not rewritten.
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  const unsigned Opc = N->getOpcode();
  const bool IsAdd = Opc == ISD::FADD || Opc == ISD::ADD;
  // NOTE(review): the element type/inline size of this SmallVector (and the
  // element type of the ArrayRef parameter further down) are not spelled out
  // in this copy of the file - confirm against the upstream source.
  SmallVector PostShuffleMask;

  // True when N's only user is a vector shuffle that has a node with the given
  // horizontal opcode as one of its two inputs.
  auto UserShufflesHorizOp = [N](unsigned HorizOpcode) {
    if (!N->hasOneUse() ||
        N->use_begin()->getOpcode() != ISD::VECTOR_SHUFFLE)
      return false;
    return N->use_begin()->getOperand(0).getOpcode() == HorizOpcode ||
           N->use_begin()->getOperand(1).getOpcode() == HorizOpcode;
  };

  // Applies the post-shuffle requested by isHorizontalBinOp(), if any.
  auto ApplyPostShuffle = [&](SDValue HOp) {
    if (PostShuffleMask.empty())
      return HOp;
    return DAG.getVectorShuffle(VT, SDLoc(HOp), HOp, DAG.getUNDEF(VT),
                                PostShuffleMask);
  };

  // Floating-point horizontal add/sub.
  if (Opc == ISD::FADD || Opc == ISD::FSUB) {
    const bool TypeSupported =
        (Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64));
    if (!TypeSupported)
      return SDValue();

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
    if (!isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
                           PostShuffleMask, UserShufflesHorizOp(HorizOpcode)))
      return SDValue();

    return ApplyPostShuffle(DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS));
  }

  // Integer horizontal add/sub.
  if (Opc == ISD::ADD || Opc == ISD::SUB) {
    const bool TypeSupported =
        Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
                                 VT == MVT::v16i16 || VT == MVT::v8i32);
    if (!TypeSupported)
      return SDValue();

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
    if (!isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
                           PostShuffleMask, UserShufflesHorizOp(HorizOpcode)))
      return SDValue();

    // The op is built through SplitOpsAndApply, which invokes this builder on
    // the operand pieces it hands out.
    auto BuildHorizOp = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
                                      ArrayRef Ops) {
      return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
    };
    return ApplyPostShuffle(SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
                                             {LHS, RHS}, BuildHorizOp));
  }

  return SDValue();
}

// Fold a complex multiply whose operand was conjugated with an explicit XOR
// into the opposite flavour of complex multiply. For example, the nodes
//  t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
//    <i32 -2147483648[float -0.000000e+00]> 0
//  t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
//    <(load 4 from constant-pool)> t0, t29
//  [t30: v16i32 = bitcast t27]
//  t6: v16i32 = xor t7, t27[t30]
//  t11: v16f32 = bitcast t6
//  t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
// become X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
//  t22: v16f32 = bitcast t7
//  t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
//  t24: v32f16 = bitcast t23
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  // The fold swaps the multiply flavour: VFCMULC <-> VFMULC.
  int CombineOpcode =
      N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;

  // Tries the fold with Conj as the bitcast-of-XOR operand and Other as the
  // remaining operand. Returns the replacement, or an empty SDValue.
  auto FoldConjugate = [&](SDValue Conj, SDValue Other) -> SDValue {
    if (Conj->getOpcode() != ISD::BITCAST || !Other.hasOneUse())
      return SDValue();

    SDValue Xor = Conj.getOperand(0);
    if (Xor->getOpcode() != ISD::XOR || !Xor.hasOneUse())
      return SDValue();

    // The XOR mask must be a known constant that flips bit 31 of every 32-bit
    // chunk (seen either as 32-bit or as 64-bit elements).
    KnownBits MaskBits = DAG.computeKnownBits(Xor.getOperand(1));
    if (!MaskBits.isConstant())
      return SDValue();

    APInt ConjugationInt32 = APInt(32, 0x80000000, true);
    APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
    const bool Is32BitMask = MaskBits.getBitWidth() == 32 &&
                             MaskBits.getConstant() == ConjugationInt32;
    const bool Is64BitMask = MaskBits.getBitWidth() == 64 &&
                             MaskBits.getConstant() == ConjugationInt64;
    if (!Is32BitMask && !Is64BitMask)
      return SDValue();

    SelectionDAG::FlagInserter FlagsInserter(DAG, N);
    SDValue I2F = DAG.getBitcast(VT, Xor.getOperand(0));
    SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, Other, I2F);
    return DAG.getBitcast(VT, FCMulC);
  };

  // Try the first operand as the conjugated one, then the second.
  if (SDValue Res = FoldConjugate(LHS, RHS))
    return Res;
  return FoldConjugate(RHS, LHS);
}

// Try to combine the following nodes:
//  FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
// where FMUL/FMA are the X86 complex multiply / multiply-add nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  // Contraction is permitted globally (fast FP-op fusion) or per node.
  auto CanContract = [&DAG](const SDNodeFlags &Flags) {
    return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
           Flags.hasAllowContract();
  };

  // Signed zeros may be ignored globally or per node.
  auto SignedZerosIgnored = [&DAG](const SDNodeFlags &Flags) {
    return DAG.getTarget().Options.NoSignedZerosFPMath ||
           Flags.hasNoSignedZeros();
  };

  // True when Op is known to be the 32-bit constant 0x80008000.
  auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
    APInt AI = APInt(32, 0x80008000, true);
    KnownBits Bits = DAG.computeKnownBits(Op);
    return Bits.getBitWidth() == 32 && Bits.isConstant() &&
           Bits.getConstant() == AI;
  };

  if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
      !CanContract(N->getFlags()))
    return SDValue();

  EVT VT = N->getValueType(0);
  const bool IsHalfVector =
      VT == MVT::v8f16 || VT == MVT::v16f16 || VT == MVT::v32f16;
  if (!IsHalfVector)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Filled in by MatchComplexMul on success.
  bool IsConj = false;
  SDValue MulOp0, MulOp1;

  // Matches a single-use bitcast of a single-use, contractable complex
  // multiply, or of a complex multiply-add whose addend can be dropped.
  auto MatchComplexMul = [&](SDValue V) -> bool {
    if (!V.hasOneUse() || V.getOpcode() != ISD::BITCAST)
      return false;

    SDValue Mul = V.getOperand(0);
    const unsigned MulOpc = Mul.getOpcode();
    if (!Mul.hasOneUse() || !CanContract(Mul->getFlags()))
      return false;

    const bool IsPlainMul =
        MulOpc == X86ISD::VFMULC || MulOpc == X86ISD::VFCMULC;
    if (!IsPlainMul) {
      if (MulOpc != X86ISD::VFMADDC && MulOpc != X86ISD::VFCMADDC)
        return false;
      // The existing addend must be all zeros (with signed zeros ignorable)
      // or the negative-zero constant.
      const bool AddendIsNeutral =
          (ISD::isBuildVectorAllZeros(Mul->getOperand(2).getNode()) &&
           SignedZerosIgnored(Mul->getFlags())) ||
          IsVectorAllNegativeZero(Mul->getOperand(2));
      if (!AddendIsNeutral)
        return false;
    }

    MulOp0 = Mul.getOperand(0);
    MulOp1 = Mul.getOperand(1);
    IsConj = MulOpc == X86ISD::VFCMULC || MulOpc == X86ISD::VFCMADDC;
    return true;
  };

  // Whichever FADD operand is not the multiply becomes the new addend.
  SDValue FAddOp1;
  if (MatchComplexMul(LHS))
    FAddOp1 = RHS;
  else if (MatchComplexMul(RHS))
    FAddOp1 = LHS;
  else
    return SDValue();

  // The multiply-add is built on an f32 vector with half as many elements as
  // the f16 result type.
  MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
  FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
  unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
  // FIXME: How do we handle when fast math flags of FADD are different from
  // CFMUL's?
  SDValue CFmul =
      DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
  return DAG.getBitcast(VT, CFmul);
}

/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // Horizontal add/sub formation takes priority over complex-FMA fusion.
  SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget);
  if (HOp)
    return HOp;

  // Yields an empty SDValue when the complex-FMA fold does not apply either.
  return combineFaddCFmul(N, DAG, Subtarget);
}

/// Lower a v2f32 -> v2i64 LRINT/LLRINT on DQI+VLX targets by widening the
/// source to v4f32 (upper half undef) and emitting X86ISD::CVTP2SI.
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  const EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  const EVT SrcVT = Src.getValueType();

  const bool IsSupported = Subtarget.hasDQI() && Subtarget.hasVLX() &&
                           VT == MVT::v2i64 && SrcVT == MVT::v2f32;
  if (!IsSupported)
    return SDValue();

  SDLoc DL(N);
  SDValue UndefHalf = DAG.getUNDEF(SrcVT);
  SDValue Widened =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src, UndefHalf);
  return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Widened);
}

/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
/// anything that is guaranteed to be transformed by DAGCombiner.
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          const SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");

  const SDValue Src = N->getOperand(0);
  const EVT VT = N->getValueType(0);

  // The wide operation must feed nothing but this truncate.
  if (!Src.hasOneUse())
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  // Only these binary operations are candidates.
  const unsigned SrcOpcode = Src.getOpcode();
  switch (SrcOpcode) {
  case ISD::MUL:
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
  case ISD::ADD:
  case ISD::SUB:
    break;
  default:
    return SDValue();
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const EVT SrcVT = Src.getValueType();
  const SDValue Op0 = Src.getOperand(0);
  const SDValue Op1 = Src.getOperand(1);
  const bool NarrowOpIsLegal = TLI.isOperationLegal(SrcOpcode, VT);

  // Does truncating Op cost nothing?
  auto CanTruncateForFree = [VT](SDValue Op) {
    const unsigned NarrowBits = VT.getScalarSizeInBits();

    // An extend from a type no wider than the truncated type lets the
    // truncation combine with the extend.
    switch (Op.getOpcode()) {
    case ISD::ANY_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Op.getOperand(0).getScalarValueSizeInBits() <= NarrowBits)
        return true;
      break;
    default:
      break;
    }

    // Otherwise only a constant build vector can be folded.
    // NOTE: We don't peek through bitcasts here because there is currently
    // no support for constant folding truncate+bitcast+vector_of_constants. So
    // we'd just end up with a truncate on both operands which will
    // get turned back into (truncate (binop)) causing an infinite loop.
    return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
  };

  // Rebuild the operation on truncated operands.
  auto EmitNarrowOp = [&](SDValue A, SDValue B) {
    SDValue NarrowA = DAG.getNode(ISD::TRUNCATE, DL, VT, A);
    SDValue NarrowB = DAG.getNode(ISD::TRUNCATE, DL, VT, B);
    return DAG.getNode(SrcOpcode, DL, VT, NarrowA, NarrowB);
  };

  // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
  // better to truncate if we have the chance.
  if (SrcOpcode == ISD::MUL && SrcVT.getScalarType() == MVT::i64 &&
      NarrowOpIsLegal && !TLI.isOperationLegal(SrcOpcode, SrcVT))
    return EmitNarrowOp(Op0, Op1);

  // In most cases its only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  if (NarrowOpIsLegal && (Op0 == Op1 || CanTruncateForFree(Op0) ||
                          CanTruncateForFree(Op1)))
    return EmitNarrowOp(Op0, Op1);

  return SDValue();
}

// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS.
There isn't a way to convey this to the generic DAG // combiner. static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // First instruction should be a right shift of a multiply. if (Src.getOpcode() != ISD::SRL || Src.getOperand(0).getOpcode() != ISD::MUL) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); // Only handle vXi16 types that are at least 128-bits unless they will be // widened. if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) return SDValue(); // Input type should be at least vXi32. EVT InVT = Src.getValueType(); if (InVT.getVectorElementType().getSizeInBits() < 32) return SDValue(); // Need a shift by 16. APInt ShiftAmt; if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) || ShiftAmt != 16) return SDValue(); SDValue LHS = Src.getOperand(0).getOperand(0); SDValue RHS = Src.getOperand(0).getOperand(1); // Count leading sign/zero bits on both inputs - if there are enough then // truncation back to vXi16 will be cheap - either as a pack/shuffle // sequence or using AVX512 truncations. If the inputs are sext/zext then the // truncations may actually be free by peeking through to the ext source. auto IsSext = [&DAG](SDValue V) { return DAG.ComputeMaxSignificantBits(V) <= 16; }; auto IsZext = [&DAG](SDValue V) { return DAG.computeKnownBits(V).countMaxActiveBits() <= 16; }; bool IsSigned = IsSext(LHS) && IsSext(RHS); bool IsUnsigned = IsZext(LHS) && IsZext(RHS); if (!IsSigned && !IsUnsigned) return SDValue(); // Check if both inputs are extensions, which will be removed by truncation. 
bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::ZERO_EXTEND) && (RHS.getOpcode() == ISD::SIGN_EXTEND || RHS.getOpcode() == ISD::ZERO_EXTEND) && LHS.getOperand(0).getScalarValueSizeInBits() <= 16 && RHS.getOperand(0).getScalarValueSizeInBits() <= 16; // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on // the (bitcasted) inputs directly, and then cheaply pack/truncate the result // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU // will have to split anyway. unsigned InSizeInBits = InVT.getSizeInBits(); if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() && !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) && (InSizeInBits % 16) == 0) { EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, InVT.getSizeInBits() / 16); SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS), DAG.getBitcast(BCVT, RHS)); return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); } // Truncate back to source type. LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS); RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU; return DAG.getNode(Opc, DL, VT, LHS, RHS); } // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes // from one vector with signed bytes from another vector, adds together // adjacent pairs of 16-bit products, and saturates the result before // truncating to 16-bits. 
//
// Which looks something like this:
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
//                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
//
// VT is the truncated result type (vXi16, X a power of two >= 8) and In is
// the value being truncated. Returns an empty SDValue if nothing matches.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget,
                               const SDLoc &DL) {
  if (!VT.isVector() || !Subtarget.hasSSSE3())
    return SDValue();

  unsigned NumElems = VT.getVectorNumElements();
  EVT ScalarVT = VT.getVectorElementType();
  if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
    return SDValue();

  SDValue SSatVal = detectSSatPattern(In, VT);
  if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
    return SDValue();

  // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
  // of multiplies from even/odd elements.
  SDValue N0 = SSatVal.getOperand(0);
  SDValue N1 = SSatVal.getOperand(1);

  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  SDValue N10 = N1.getOperand(0);
  SDValue N11 = N1.getOperand(1);

  // TODO: Handle constant vectors and use knownbits/computenumsignbits?
  // Canonicalize zero_extend to LHS.
  if (N01.getOpcode() == ISD::ZERO_EXTEND)
    std::swap(N00, N01);
  if (N11.getOpcode() == ISD::ZERO_EXTEND)
    std::swap(N10, N11);

  // Ensure we have a zero_extend and a sign_extend.
  if (N00.getOpcode() != ISD::ZERO_EXTEND ||
      N01.getOpcode() != ISD::SIGN_EXTEND ||
      N10.getOpcode() != ISD::ZERO_EXTEND ||
      N11.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();

  // Peek through the extends.
  N00 = N00.getOperand(0);
  N01 = N01.getOperand(0);
  N10 = N10.getOperand(0);
  N11 = N11.getOperand(0);

  // Ensure the extend is from vXi8.
  if (N00.getValueType().getVectorElementType() != MVT::i8 ||
      N01.getValueType().getVectorElementType() != MVT::i8 ||
      N10.getValueType().getVectorElementType() != MVT::i8 ||
      N11.getValueType().getVectorElementType() != MVT::i8)
    return SDValue();

  // All inputs should be build_vectors.
  if (N00.getOpcode() != ISD::BUILD_VECTOR ||
      N01.getOpcode() != ISD::BUILD_VECTOR ||
      N10.getOpcode() != ISD::BUILD_VECTOR ||
      N11.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // N00/N10 are zero extended. N01/N11 are sign extended.

  // For each element, we need to ensure we have an odd element from one vector
  // multiplied by the odd element of another vector and the even element from
  // one of the same vectors being multiplied by the even element from the
  // other vector. So we need to make sure for each element i, this operator
  // is being performed:
  //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
  SDValue ZExtIn, SExtIn;
  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue N00Elt = N00.getOperand(i);
    SDValue N01Elt = N01.getOperand(i);
    SDValue N10Elt = N10.getOperand(i);
    SDValue N11Elt = N11.getOperand(i);
    // TODO: Be more tolerant to undefs.
    if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
    // Every extract index must be a compile-time constant.
    auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
    auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
    auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
    auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
    if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
      return SDValue();
    unsigned IdxN00 = ConstN00Elt->getZExtValue();
    unsigned IdxN01 = ConstN01Elt->getZExtValue();
    unsigned IdxN10 = ConstN10Elt->getZExtValue();
    unsigned IdxN11 = ConstN11Elt->getZExtValue();
    // Add is commutative so indices can be reordered.
    if (IdxN00 > IdxN10) {
      std::swap(IdxN00, IdxN10);
      std::swap(IdxN01, IdxN11);
    }
    // N0 indices must be the even element. N1 indices must be the next odd
    // element.
    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
      return SDValue();
    SDValue N00In = N00Elt.getOperand(0);
    SDValue N01In = N01Elt.getOperand(0);
    SDValue N10In = N10Elt.getOperand(0);
    SDValue N11In = N11Elt.getOperand(0);
    // First time we find an input capture it.
    if (!ZExtIn) {
      ZExtIn = N00In;
      SExtIn = N01In;
    }
    // All lanes must read from the same pair of source vectors.
    if (ZExtIn != N00In || SExtIn != N01In ||
        ZExtIn != N10In || SExtIn != N11In)
      return SDValue();
  }

  // Narrow a source vector to exactly 2 * NumElems bytes if it is a different
  // length.
  auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
    EVT ExtVT = Ext.getValueType();
    if (ExtVT.getVectorNumElements() != NumElems * 2) {
      MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
      Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
                        DAG.getIntPtrConstant(0, DL));
    }
  };
  ExtractVec(ZExtIn);
  ExtractVec(SExtIn);

  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
    // Shrink by adding truncate nodes and let DAGCombine fold with the
    // sources.
    EVT InVT = Ops[0].getValueType();
    assert(InVT.getScalarType() == MVT::i8 &&
           "Unexpected scalar element type");
    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                 InVT.getVectorNumElements() / 2);
    return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
                          PMADDBuilder);
}

/// Do target-specific dag combines on ISD::TRUNCATE nodes. The folds are tried
/// in order and the first that succeeds wins.
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect PMADD
  if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
    return PMAdd;

  // Try to combine truncation with signed/unsigned saturation.
  if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // Try to combine PMULHUW/PMULHW for vXi16.
  if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
    return V;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 to x86mmx
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
  if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
      Src.hasOneUse())
    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::VTRUNC nodes: fold a saturation
/// pattern on the input into VTRUNCS/VTRUNCUS, otherwise simplify the input
/// based on the demanded bits of the result.
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SDValue In = N->getOperand(0);
  SDLoc DL(N);

  if (SDValue SSatVal = detectSSatPattern(In, VT))
    return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
  if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
    return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
/// or FSUB(0, x)
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
/// This also recognizes splat of a negated value and returns the splat of that
/// value.
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  // Don't recurse exponentially.
if (Depth > SelectionDAG::MaxRecursionDepth) return SDValue(); unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); SDValue Op = peekThroughBitcasts(SDValue(N, 0)); EVT VT = Op->getValueType(0); // Make sure the element size doesn't change. if (VT.getScalarSizeInBits() != ScalarSize) return SDValue(); unsigned Opc = Op.getOpcode(); switch (Opc) { case ISD::VECTOR_SHUFFLE: { // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!Op.getOperand(1).isUndef()) return SDValue(); if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1)) if (NegOp0.getValueType() == VT) // FIXME: Can we do better? return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT), cast(Op)->getMask()); break; } case ISD::INSERT_VECTOR_ELT: { // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF, // -V, INDEX). SDValue InsVector = Op.getOperand(0); SDValue InsVal = Op.getOperand(1); if (!InsVector.isUndef()) return SDValue(); if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1)) if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, NegInsVal, Op.getOperand(2)); break; } case ISD::FSUB: case ISD::XOR: case X86ISD::FXOR: { SDValue Op1 = Op.getOperand(1); SDValue Op0 = Op.getOperand(0); // For XOR and FXOR, we want to check if constant // bits of Op1 are sign bit masks. For FSUB, we // have to check if constant bits of Op0 are sign // bit masks and hence we swap the operands. if (Opc == ISD::FSUB) std::swap(Op0, Op1); APInt UndefElts; SmallVector EltBits; // Extract constant bits and see if they are all // sign bit masks. Ignore the undef elements. 
if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) { for (unsigned I = 0, E = EltBits.size(); I < E; I++) if (!UndefElts[I] && !EltBits[I].isSignMask()) return SDValue(); // Only allow bitcast from correctly-sized constant. Op0 = peekThroughBitcasts(Op0); if (Op0.getScalarValueSizeInBits() == ScalarSize) return Op0; } break; } // case } // switch return SDValue(); } static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, bool NegRes) { if (NegMul) { switch (Opcode) { // clang-format off default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMADD; break; case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break; case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; case X86ISD::FNMADD: Opcode = ISD::FMA; break; case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break; case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; // clang-format on } } if (NegAcc) { switch (Opcode) { // clang-format off default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FMSUB; break; case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; case X86ISD::FMSUB: Opcode = ISD::FMA; break; case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break; case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break; case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; case X86ISD::FNMSUB: Opcode = 
X86ISD::FNMADD; break; case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; // clang-format on } } if (NegRes) { switch (Opcode) { // For accuracy reason, we never combine fneg and fma under strict FP. // clang-format off default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMSUB; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break; case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break; case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break; case X86ISD::FNMSUB: Opcode = ISD::FMA; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break; // clang-format on } } return Opcode; } /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT OrigVT = N->getValueType(0); SDValue Arg = isFNEG(DAG, N); if (!Arg) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = Arg.getValueType(); EVT SVT = VT.getScalarType(); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. if (!TLI.isTypeLegal(VT)) return SDValue(); // If we're negating a FMUL node on a target with FMA, then we can avoid the // use of a constant by performing (-0 - A*B) instead. // FIXME: Check rounding control flags as well once it becomes available. 
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Zero); return DAG.getBitcast(OrigVT, NewNode); } bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); bool LegalOperations = !DCI.isBeforeLegalizeOps(); if (SDValue NegArg = TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize)) return DAG.getBitcast(OrigVT, NegArg); return SDValue(); } SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const { // fneg patterns are removable even if they have multiple uses. if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) { Cost = NegatibleCost::Cheaper; return DAG.getBitcast(Op.getValueType(), Arg); } EVT VT = Op.getValueType(); EVT SVT = VT.getScalarType(); unsigned Opc = Op.getOpcode(); SDNodeFlags Flags = Op.getNode()->getFlags(); switch (Opc) { case ISD::FMA: case X86ISD::FMSUB: case X86ISD::FNMADD: case X86ISD::FNMSUB: case X86ISD::FMADD_RND: case X86ISD::FMSUB_RND: case X86ISD::FNMADD_RND: case X86ISD::FNMSUB_RND: { if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || !(SVT == MVT::f32 || SVT == MVT::f64) || !isOperationLegal(ISD::FMA, VT)) break; // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z) // if it may have signed zeros. if (!Flags.hasNoSignedZeros()) break; // This is always negatible for free but we might be able to remove some // extra operand negations as well. 
SmallVector NewOps(Op.getNumOperands(), SDValue()); for (int i = 0; i != 3; ++i) NewOps[i] = getCheaperNegatedExpression( Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1); bool NegA = !!NewOps[0]; bool NegB = !!NewOps[1]; bool NegC = !!NewOps[2]; unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper : NegatibleCost::Neutral; // Fill in the non-negated ops with the original values. for (int i = 0, e = Op.getNumOperands(); i != e; ++i) if (!NewOps[i]) NewOps[i] = Op.getOperand(i); return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); } case X86ISD::FRCP: if (SDValue NegOp0 = getNegatedExpression(Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Cost, Depth + 1)) return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0); break; } return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, ForCodeSize, Cost, Depth); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); // If we have integer vector types available, use the integer opcodes. 
if (!VT.isVector() || !Subtarget.hasSSE2()) return SDValue(); SDLoc dl(N); unsigned IntBits = VT.getScalarSizeInBits(); MVT IntSVT = MVT::getIntegerVT(IntBits); MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits); SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); unsigned IntOpcode; switch (N->getOpcode()) { // clang-format off default: llvm_unreachable("Unexpected FP logic op"); case X86ISD::FOR: IntOpcode = ISD::OR; break; case X86ISD::FXOR: IntOpcode = ISD::XOR; break; case X86ISD::FAND: IntOpcode = ISD::AND; break; case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; // clang-format on } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); return DAG.getBitcast(VT, IntOp); } /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val) static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() != ISD::XOR) return SDValue(); SDValue LHS = N->getOperand(0); if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC) return SDValue(); X86::CondCode NewCC = X86::GetOppositeBranchCondition( X86::CondCode(LHS->getConstantOperandVal(0))); SDLoc DL(N); return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); } static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) && "Invalid opcode for combing with CTLZ"); if (Subtarget.hasFastLZCNT()) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 && (VT != MVT::i64 || !Subtarget.is64Bit())) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF && N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF) return SDValue(); SDValue OpCTLZ; SDValue OpSizeTM1; if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) { OpCTLZ = N1; OpSizeTM1 = N0; } else if (N->getOpcode() == ISD::SUB) { return SDValue(); } 
else { OpCTLZ = N0; OpSizeTM1 = N1; } if (!OpCTLZ.hasOneUse()) return SDValue(); auto *C = dyn_cast(OpSizeTM1); if (!C) return SDValue(); if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1)) return SDValue(); EVT OpVT = VT; SDValue Op = OpCTLZ.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op); } SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op); return Op; } static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); // If this is SSE1 only convert to FXOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) return R; if (SDValue R = combineBitOpWithShift(N, DAG)) return R; if (SDValue R = combineBitOpWithPACK(N, DAG)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget)) return R; if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue SetCC = foldXor1SetCC(N, DAG)) return SetCC; if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG)) return R; if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) return RV; // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs. 
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST && N0.getOperand(0).getValueType().isVector() && N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) { return DAG.getBitcast( VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType())); } // Handle AVX512 mask widening. // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub)) if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() && VT.getVectorElementType() == MVT::i1 && N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() && TLI.isTypeLegal(N0.getOperand(1).getValueType())) { return DAG.getNode( ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0), DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()), N0.getOperand(2)); } // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2)) // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2)) // TODO: Under what circumstances could this be performed in DAGCombine? 
if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) && N0.getOperand(0).getOpcode() == N->getOpcode()) { SDValue TruncExtSrc = N0.getOperand(0); auto *N1C = dyn_cast(N1); auto *N001C = dyn_cast(TruncExtSrc.getOperand(1)); if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) { SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT); SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT); return DAG.getNode(ISD::XOR, DL, VT, LHS, DAG.getNode(ISD::XOR, DL, VT, RHS, N1)); } } if (SDValue R = combineBMILogicOp(N, DAG, Subtarget)) return R; return combineFneg(N, DAG, DCI, Subtarget); } static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X))) if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) { SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 && (DCI.isBeforeLegalize() || DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) && Subtarget.hasSSSE3()) { unsigned NumElts = SrcVT.getVectorNumElements(); SmallVector ReverseMask(NumElts); for (unsigned I = 0; I != NumElts; ++I) ReverseMask[I] = (NumElts - 1) - I; SDValue Rev = DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask); return DAG.getBitcast(VT, Rev); } } return SDValue(); } // Various combines to try to convert to avgceilu. static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); SDLoc DL(N); // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y))) // Only useful on vXi8 which doesn't have good SRA handling. 
if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) { APInt SignBit = APInt::getSignMask(VT.getScalarSizeInBits()); SDValue SignMask = DAG.getConstant(SignBit, DL, VT); N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask); N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask); return DAG.getNode(ISD::XOR, DL, VT, DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask); } return SDValue(); } static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); unsigned NumBits = VT.getSizeInBits(); // TODO - Constant Folding. // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getAllOnes(NumBits)); if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); } static bool isNullFPScalarOrVectorConst(SDValue V) { return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode()); } /// If a value is a scalar FP zero or a vector FP zero (potentially including /// undefined elements), return a zero constant that may be used to fold away /// that value. In the case of a vector, the returned constant will not contain /// undefined elements even if the input parameter does. This makes it suitable /// to be used as a replacement operand with operations (eg, bitwise-and) where /// an undef should not propagate. static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!isNullFPScalarOrVectorConst(V)) return SDValue(); if (V.getValueType().isVector()) return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V)); return V; } static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). 
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::f64 && Subtarget.hasSSE2()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2()))) return SDValue(); auto isAllOnesConstantFP = [](SDValue V) { if (V.getSimpleValueType().isVector()) return ISD::isBuildVectorAllOnes(V.getNode()); auto *C = dyn_cast(V); return C && C->getConstantFPValue()->isAllOnesValue(); }; // fand (fxor X, -1), Y --> fandn X, Y if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1); // fand X, (fxor Y, -1) --> fandn Y, X if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0); return SDValue(); } /// Do target-specific dag combines on X86ISD::FAND nodes. static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FAND(0.0, x) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget)) return V; // FAND(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes. static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FANDN(0.0, x) -> x if (isNullFPScalarOrVectorConst(N->getOperand(0))) return N->getOperand(1); // FANDN(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. 
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
    return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
  if (!DAG.getTarget().Options.NoNaNsFPMath ||
      !DAG.getTarget().Options.NoSignedZerosFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are Commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("unknown opcode");
  case X86ISD::FMIN:
    NewOp = X86ISD::FMINC;
    break;
  case X86ISD::FMAX:
    NewOp = X86ISD::FMAXC;
    break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0),
                     N->getOperand(1));
}

/// Combine ISD::FMAXNUM (and otherwise, the min form) into X86ISD::FMAX /
/// X86ISD::FMIN. When NaNs need not be respected, or one operand is known
/// never to be NaN, a single node is emitted; otherwise a min/max plus an
/// unordered compare and select is built. Returns an empty SDValue when the
/// type is not handled or a scalar op is being compiled for minimum size.
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
        (Subtarget.hasSSE2() && VT == MVT::f64) ||
        (Subtarget.hasFP16() && VT == MVT::f16) ||
        (VT.isVector() && TLI.isTypeLegal(VT))))
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;

  // If we don't have to respect NaN inputs, this is a direct translation to x86
  // min/max instructions.
  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());

  // If one of the operands is known non-NaN use the native min/max instructions
  // with the non-NaN input as second operand.
  if (DAG.isKnownNeverNaN(Op1))
    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
  if (DAG.isKnownNeverNaN(Op0))
    return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());

  // If we have to respect NaN inputs, this takes at least 3 instructions.
  // Favor a library call when operating on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  EVT SetCCType =
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

/// Combine an X86 int-to-FP conversion node: simplify the demanded vector
/// elements of the result, and narrow a single-use normal vector load that
/// feeds the conversion into a vzload when the result has fewer elements than
/// the input.
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
    return SDValue(N, 0);

  // Convert a full vector load into vzload when not all bits are needed.
  SDValue In = N->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
    MVT MemVT = MVT::getIntegerVT(NumBits);
    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
      SDLoc dl(N);
      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
                                    DAG.getBitcast(InVT, VZLoad));
      DCI.CombineTo(N, Convert);
      // Redirect users of the old load's chain to the new load's chain.
      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
      DCI.recursivelyDeleteUnusedNodes(LN);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

/// Combine an FP-to-int vector conversion node (strict or non-strict): narrow
/// a single-use normal vector load that feeds the conversion into a vzload
/// when the result has fewer elements than the input. For strict nodes the
/// chain is operand 0 and the source is operand 1.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  bool IsStrict = N->isTargetStrictFPOpcode();
  EVT VT = N->getValueType(0);

  // Convert a full vector load into vzload when not all bits are needed.
  SDValue In = N->getOperand(IsStrict ? 1 : 0);
  MVT InVT = In.getSimpleValueType();
  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
    LoadSDNode *LN = cast<LoadSDNode>(In);
    unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
    MVT MemVT = MVT::getFloatingPointVT(NumBits);
    MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
    if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
      SDLoc dl(N);
      if (IsStrict) {
        SDValue Convert =
            DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
                        {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
        DCI.CombineTo(N, Convert, Convert.getValue(1));
      } else {
        SDValue Convert =
            DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
        DCI.CombineTo(N, Convert);
      }
      // Redirect users of the old load's chain to the new load's chain.
      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
      DCI.recursivelyDeleteUnusedNodes(LN);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  MVT VT = N->getSimpleValueType(0);
  int NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDLoc DL(N);

  // ANDNP(undef, x) -> 0
  // ANDNP(x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N0.getNode()))
    return N1;

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N1.getNode()))
    return DAG.getConstant(0, DL, VT);

  // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
  if (ISD::isBuildVectorAllOnes(N1.getNode()))
    return DAG.getNOT(DL, N0, VT);

  // Turn ANDNP back to AND if input is inverted.
  if (SDValue Not = IsNOT(N0, DAG))
    return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);

  // Fold for better commutativity:
  // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
  if (N1->hasOneUse())
    if (SDValue Not = IsNOT(N1, DAG))
      return DAG.getNOT(
          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);

  // Constant Folding
  APInt Undefs0, Undefs1;
  SmallVector<APInt> EltBits0, EltBits1;
  if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
                                    /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ true)) {
    if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
                                      /*AllowWholeUndefs*/ true,
                                      /*AllowPartialUndefs*/ true)) {
      SmallVector<APInt> ResultBits;
      for (int I = 0; I != NumElts; ++I)
        ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
      return getConstVector(ResultBits, VT, DAG, DL);
    }

    // Constant fold NOT(N0) to allow us to use AND.
    // Ensure this is only performed if we can confirm that the bitcasted source
    // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
    if (N0->hasOneUse()) {
      SDValue BC0 = peekThroughOneUseBitcasts(N0);
      if (BC0.getOpcode() != ISD::BITCAST) {
        for (APInt &Elt : EltBits0)
          Elt = ~Elt;
        SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
        return DAG.getNode(ISD::AND, DL, VT, Not, N1);
      }
    }
  }

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
      return Res;

    // If either operand is a constant mask, then only the elements that aren't
    // zero are actually demanded by the other operand.
    auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
      APInt UndefElts;
      SmallVector<APInt> EltBits;
      APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
      APInt DemandedElts = APInt::getAllOnes(NumElts);
      if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
                                        EltBits)) {
        DemandedBits.clearAllBits();
        DemandedElts.clearAllBits();
        for (int I = 0; I != NumElts; ++I) {
          if (UndefElts[I]) {
            // We can't assume an undef src element gives an undef dst - the
            // other src might be zero.
            DemandedBits.setAllBits();
            DemandedElts.setBit(I);
          } else if ((Invert && !EltBits[I].isAllOnes()) ||
                     (!Invert && !EltBits[I].isZero())) {
            DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
            DemandedElts.setBit(I);
          }
        }
      }
      return std::make_pair(DemandedBits, DemandedElts);
    };
    APInt Bits0, Elts0;
    APInt Bits1, Elts1;
    // What is demanded of N0 is derived from the constant bits of N1, and
    // what is demanded of N1 from the inverted constant bits of N0.
    std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
    std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
        TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
        TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
        TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

/// Simplify the bit index operand (operand 1) of the node \p N, demanding
/// only its low Log2(BitWidth) bits.
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N1 = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
  if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  return SDValue();
}

/// Combine a (STRICT_)CVTPH2PS node that produces v4f32 from a v8i16 source:
/// only the low 4 source elements are demanded, and a single-use normal load
/// of the source is narrowed to a 64-bit vzload.
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI) {
  bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
  SDValue Src = N->getOperand(IsStrict ? 1 : 0);

  if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    APInt DemandedElts = APInt::getLowBitsSet(8, 4);
    if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
      if (N->getOpcode() != ISD::DELETED_NODE)
        DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }

    // Convert a full vector load into vzload when not all bits are needed.
    if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
      LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
      if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
        SDLoc dl(N);
        if (IsStrict) {
          SDValue Convert = DAG.getNode(
              N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
              {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
          DCI.CombineTo(N, Convert, Convert.getValue(1));
        } else {
          SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
                                        DAG.getBitcast(MVT::v8i16, VZLoad));
          DCI.CombineTo(N, Convert);
        }

        // Redirect users of the old load's chain to the new load's chain.
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
        return SDValue(N, 0);
      }
    }
  }

  return SDValue();
}

// Try to combine sext_in_reg of a cmov of constants by extending the constants.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

  EVT DstVT = N->getValueType(0);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();

  if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
    return SDValue();

  // Look through single use any_extends / truncs.
  SDValue IntermediateBitwidthOp;
  if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
      N0.hasOneUse()) {
    IntermediateBitwidthOp = N0;
    N0 = N0.getOperand(0);
  }

  // See if we have a single use cmov.
  if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
    return SDValue();

  SDValue CMovOp0 = N0.getOperand(0);
  SDValue CMovOp1 = N0.getOperand(1);

  // Make sure both operands are constants.
  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
      !isa<ConstantSDNode>(CMovOp1.getNode()))
    return SDValue();

  SDLoc DL(N);

  // If we looked through an any_extend/trunc above, apply that same operation
  // to each of the constants.
  if (IntermediateBitwidthOp) {
    unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
    CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
    CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
  }

  CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
  CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);

  EVT CMovVT = DstVT;
  // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
  if (DstVT == MVT::i16) {
    CMovVT = MVT::i32;
    CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
    CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
  }

  SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
                             N0.getOperand(2), N0.getOperand(3));

  if (CMovVT != DstVT)
    CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);

  return CMov;
}

/// Do target-specific dag combines on ISD::SIGN_EXTEND_INREG nodes: first try
/// the cmov-of-constants fold, then the v4i64 extend-of-v4i32 rewrite below.
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);

  if (SDValue V = combineSextInRegCmov(N, DAG))
    return V;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
  // AVX2 since there is no sign-extended shift right operation on a vector
  // with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    // Attempt to promote any comparison mask ops before moving the
    // SIGN_EXTEND_INREG in the way.
    if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp =
          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  SDValue AddOp0 = Add.getOperand(0);
  SDValue AddOp1 = Add.getOperand(1);
  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();
  // Even without the flag, accept the add if the DAG can prove that it does
  // not overflow in the sense that matters for this extension kind.
  NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
  NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
  if (!AddOp1C)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}

// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
//     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
//        (or more) pseudo-CMOVs only when they go one-after-another and
//        getting rid of result extension code after CMOV will help that.
//     2) Promotion of constant CMOV arguments is free, hence the
//        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
//     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
//        promotion is also good in terms of code-size.
//        (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
//         promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMovN = Extend->getOperand(0);
  if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
    return SDValue();

  EVT TargetVT = Extend->getValueType(0);
  unsigned ExtendOpcode = Extend->getOpcode();
  SDLoc DL(Extend);

  EVT VT = CMovN.getValueType();
  SDValue CMovOp0 = CMovN.getOperand(0);
  SDValue CMovOp1 = CMovN.getOperand(1);

  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
      !isa<ConstantSDNode>(CMovOp1.getNode()))
    return SDValue();

  // Only extend to i32 or i64.
  if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
    return SDValue();

  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
  // i32 are free.
  if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
    return SDValue();

  // If this is a zero extend to i64, we should only extend to i32 and use a
  // free zero extend to finish.
  EVT ExtendVT = TargetVT;
  if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
    ExtendVT = MVT::i32;

  CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
  CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);

  SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
                            CMovN.getOperand(2), CMovN.getOperand(3));

  // Finish extending if needed.
  if (ExtendVT != TargetVT)
    Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);

  return Res;
}

// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // Only do this combine with AVX512 for vector extends.
  if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
    return SDValue();

  // Only combine legal element types.
  EVT SVT = VT.getVectorElementType();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
      SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
    return SDValue();

  // We don't have CMPP Instruction for vxf16
  if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
    return SDValue();

  // We can only do this if the vector size is 256 bits or less.
  unsigned Size = VT.getSizeInBits();
  if (Size > 256 && Subtarget.useAVX512Regs())
    return SDValue();

  // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
  // those are the only integer compares we have.
  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
  if (ISD::isUnsignedIntSetCC(CC))
    return SDValue();

  // Only do this combine if the extension will be fully consumed by the setcc.
  EVT N00VT = N0.getOperand(0).getValueType();
  EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
  if (Size != MatchingVecType.getSizeInBits())
    return SDValue();

  SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);

  if (N->getOpcode() == ISD::ZERO_EXTEND)
    Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());

  return Res;
}

/// Do target-specific dag combines on the sign-extend node \p N. The
/// setcc_carry widening and the CMOV promotion run at any stage; the
/// remaining folds only run before operation legalization.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
  if (!DCI.isBeforeLegalizeOps() &&
      N0.getOpcode() == X86ISD::SETCC_CARRY) {
    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
                                N0->getOperand(1));
    bool ReplaceOtherUses = !N0.hasOneUse();
    DCI.CombineTo(N, Setcc);
    // Replace other uses with a truncate of the widened setcc_carry.
    if (ReplaceOtherUses) {
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                  N0.getValueType(), Setcc);
      DCI.CombineTo(N0.getNode(), Trunc);
    }

    return SDValue(N, 0);
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
                                                 DAG, DCI, Subtarget))
    return V;

  if (VT.isVector()) {
    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
      return R;

    if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
  }

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

// Inverting a constant vector is profitable if it can be eliminated and the
// inverted vector is already present in DAG. Otherwise, it will be loaded
// anyway.
//
// We determine which of the values can be completely eliminated and invert it.
// If both are eliminable, select a vector with the first negative element.
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
         "ConstantFP build vector expected");
  // Check if we can eliminate V. We assume if a value is only used in FMAs, we
  // can eliminate it. Since this function is invoked for each FMA with this
  // vector.
  auto IsNotFMA = [](SDNode *Use) {
    return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
  };
  if (llvm::any_of(V->uses(), IsNotFMA))
    return SDValue();

  // Build the operand list of the element-wise negated vector, keeping undef
  // elements undef.
  SmallVector<SDValue, 8> Ops;
  EVT VT = V.getValueType();
  EVT EltVT = VT.getVectorElementType();
  for (const SDValue &Op : V->op_values()) {
    if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
      Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
    } else {
      assert(Op.isUndef());
      Ops.push_back(DAG.getUNDEF(EltVT));
    }
  }

  // Only proceed if the negated vector already exists in the DAG.
  SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
  if (!NV)
    return SDValue();

  // If an inverted version cannot be eliminated, choose it instead of the
  // original version.
  if (llvm::any_of(NV->uses(), IsNotFMA))
    return SDValue(NV, 0);

  // If the inverted version also can be eliminated, we have to consistently
  // prefer one of the values. We prefer a constant with a negative value on
  // the first place.
  // N.B. We need to skip undefs that may precede a value.
  for (const SDValue &Op : V->op_values()) {
    if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
      if (Cst->isNegative())
        return SDValue();
      break;
    }
  }
  return SDValue(NV, 0);
}

/// Do target-specific dag combines on FMA-family nodes (strict or
/// non-strict): split into fmul+fadd when reassociation is allowed and FMA
/// would be expanded, otherwise fold negated operands into the opcode via
/// negateFMAOpcode.
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();

  // Let legalize expand this if it isn't a legal type yet.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // For strict nodes operand 0 is the chain, so the data operands shift by 1.
  SDValue A = N->getOperand(IsStrict ? 1 : 0);
  SDValue B = N->getOperand(IsStrict ? 2 : 1);
  SDValue C = N->getOperand(IsStrict ? 3 : 2);

  // If the operation allows fast-math and the target does not support FMA,
  // split this into mul+add to avoid libcall(s).
  SDNodeFlags Flags = N->getFlags();
  if (!IsStrict && Flags.hasAllowReassociation() &&
      TLI.isOperationExpand(ISD::FMA, VT)) {
    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
    return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
  }

  EVT ScalarVT = VT.getScalarType();
  if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
       !Subtarget.hasAnyFMA()) &&
      !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
    return SDValue();

  // Replaces V with its negation and returns true when a cheaper negated
  // form of V is available; otherwise leaves V unchanged and returns false.
  auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
    bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
    bool LegalOperations = !DCI.isBeforeLegalizeOps();
    if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
                                                       CodeSize)) {
      V = NegV;
      return true;
    }
    // Look through extract_vector_elts. If it comes from an FNEG, create a
    // new extract from the FNEG input.
    if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        isNullConstant(V.getOperand(1))) {
      SDValue Vec = V.getOperand(0);
      if (SDValue NegV = TLI.getCheaperNegatedExpression(
              Vec, DAG, LegalOperations, CodeSize)) {
        V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
                        NegV, V.getOperand(1));
        return true;
      }
    }
    // Lookup if there is an inverted version of constant vector V in DAG.
    if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
      if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
        V = NegV;
        return true;
      }
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = invertIfNegative(C);

  if (!NegA && !NegB && !NegC)
    return SDValue();

  // Negating both multiplicands cancels out, hence NegA != NegB.
  unsigned NewOpcode =
      negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);

  // Propagate fast-math-flags to new FMA node.
  SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
  if (IsStrict) {
    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
    return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
                       {N->getOperand(0), A, B, C});
  } else {
    if (N->getNumOperands() == 4)
      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }
}

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
  bool LegalOperations = !DCI.isBeforeLegalizeOps();

  SDValue N2 = N->getOperand(2);

  SDValue NegN2 =
      TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
  if (!NegN2)
    return SDValue();
  unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);

  // Forward the optional fourth operand unchanged when it is present.
  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                       NegN2, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                     NegN2);
}

/// Do target-specific dag combines on the extend node \p N. The first fold
/// only applies to ISD::ANY_EXTEND; the remaining folds do not check the
/// opcode here.
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
  // FIXME: Is this needed? We don't seem to have any tests for it.
  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
      N0.getOpcode() == X86ISD::SETCC_CARRY) {
    SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
                                N0->getOperand(1));
    bool ReplaceOtherUses = !N0.hasOneUse();
    DCI.CombineTo(N, Setcc);
    // Replace other uses with a truncate of the widened setcc_carry.
    if (ReplaceOtherUses) {
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                  N0.getValueType(), Setcc);
      DCI.CombineTo(N0.getNode(), Trunc);
    }

    return SDValue(N, 0);
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (DCI.isBeforeLegalizeOps())
    if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
      return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
                                                 DAG, DCI, Subtarget))
    return V;

  if (VT.isVector())
    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  // TODO: Combine with any target/faux shuffle.
  if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
      VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
    SDValue N00 = N0.getOperand(0);
    SDValue N01 = N0.getOperand(1);
    unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
    APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
    // If the upper half of every source element is already known zero, the
    // pack followed by the extend is just the concatenation of the sources.
    if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
        (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
      return concatSubVectors(N00, N01, DAG, dl);
    }
  }

  return SDValue();
}

/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
/// pre-promote its result type since vXi1 vectors don't get promoted
/// during type legalization.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
                                        SDValue RHS, ISD::CondCode CC,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
      VT.getVectorElementType() == MVT::i1 &&
      (OpVT.getVectorElementType() == MVT::i8 ||
       OpVT.getVectorElementType() == MVT::i16)) {
    // Compare with the operand type as the result type, then truncate the
    // result down to the requested vXi1 type.
    SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
  }
  return SDValue();
}

// Combine for a setcc node (operands: LHS, RHS, condition code). Tries, in
// order: scalar eq/ne rewrites, folding compares of sext(vXi1) against zero,
// turning unsigned vector compares into signed ones, the AVX512-without-BWI
// pre-promotion, undoing the "add + unsigned compare" range check, the
// SSE1-only v4f32 special case, and "X pred 0.0 --> X pred -X". Returns the
// replacement value or an empty SDValue.
// NOTE(review): the cast/dyn_cast/isa calls in this function carry no template
// argument in this copy of the file - presumably lost in extraction; confirm
// against upstream.
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  const ISD::CondCode CC = cast(N->getOperand(2))->get();
  const SDValue LHS = N->getOperand(0);
  const SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT OpVT = LHS.getValueType();
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
                                                    Subtarget))
      return V;

    if (VT == MVT::i1) {
      X86::CondCode X86CC;
      if (SDValue V =
              MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
        return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
    }

    if (OpVT.isScalarInteger()) {
      // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
      // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
      // Only matches a single-use OR; tried with the operands in both orders.
      auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
        if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
          if (N0.getOperand(0) == N1)
            return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
                               N0.getOperand(1));
          if (N0.getOperand(1) == N1)
            return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
                               N0.getOperand(0));
        }
        return SDValue();
      };
      if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
      if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);

      // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
      // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
      // Only matches a single-use AND; tried with the operands in both orders.
      auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
        if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
          if (N0.getOperand(0) == N1)
            return DAG.getNode(ISD::AND, DL, OpVT, N1,
                               DAG.getNOT(DL, N0.getOperand(1), OpVT));
          if (N0.getOperand(1) == N1)
            return DAG.getNode(ISD::AND, DL, OpVT, N1,
                               DAG.getNOT(DL, N0.getOperand(0), OpVT));
        }
        return SDValue();
      };
      if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
      if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
        return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);

      // cmpeq(trunc(x),C) --> cmpeq(x,C)
      // cmpne(trunc(x),C) --> cmpne(x,C)
      // iff x upper bits are zero.
      // Restricted to sources of at least 32 bits with a legal type, and only
      // once DCI no longer reports isBeforeLegalize().
      if (LHS.getOpcode() == ISD::TRUNCATE &&
          LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
          isa(RHS) && !DCI.isBeforeLegalize()) {
        EVT SrcVT = LHS.getOperand(0).getValueType();
        APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
                                                OpVT.getScalarSizeInBits());
        const TargetLowering &TLI = DAG.getTargetLoweringInfo();
        if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
            TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
          return DAG.getSetCC(DL, VT, LHS.getOperand(0),
                              DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
      }

      // With C as a power of 2 and C != 0 and C != INT_MIN:
      //    icmp eq Abs(X) C ->
      //        (icmp eq X, C) | (icmp eq X, -C)
      //    icmp ne Abs(X) C ->
      //        (icmp ne X, C) & (icmp ne X, -C)
      // Both of these patterns can be better optimized in
      // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
      // integers which is checked above.
      if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
        if (auto *C = dyn_cast(RHS)) {
          const APInt &CInt = C->getAPIntValue();
          // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
          if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
            SDValue BaseOp = LHS.getOperand(0);
            SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
            SDValue SETCC1 = DAG.getSetCC(
                DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
            return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
                               SETCC0, SETCC1);
          }
        }
      }
    }
  }

  // Compare of sext(vXi1 X) against an all-zeros build_vector. Every lane of
  // the sign-extended value is 0 or -1, so the result is a constant, X or
  // NOT X depending on the condition code.
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    // Using temporaries to avoid messing up operand ordering for later
    // transformations if this doesn't work.
    SDValue Op0 = LHS;
    SDValue Op1 = RHS;
    ISD::CondCode TmpCC = CC;
    // Put build_vector on the right.
    if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(Op0, Op1);
      TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
    }

    bool IsSEXT0 =
        (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
        (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());

    if (IsSEXT0 && IsVZero1) {
      assert(VT == Op0.getOperand(0).getValueType() &&
             "Unexpected operand type");
      // sext(X) > 0 is never true; sext(X) <= 0 is always true.
      if (TmpCC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (TmpCC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      // sext(X) == 0 and sext(X) >= 0 hold exactly when X is false.
      if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
        return DAG.getNOT(DL, Op0.getOperand(0), VT);

      // sext(X) != 0 and sext(X) < 0 hold exactly when X is true.
      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
             "Unexpected condition code!");
      return Op0.getOperand(0);
    }
  }

  // Try and make unsigned vector comparison signed. On pre AVX512 targets there
  // are only signed comparisons (`PCMPGT`) and on AVX512 it's often better to
  // use `PCMPGT` if the result is meant to stay in a vector (and if it's going
  // to a mask, there are signed AVX512 comparisons).
  if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
    bool CanMakeSigned = false;
    if (ISD::isUnsignedIntSetCC(CC)) {
      KnownBits CmpKnown =
          DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
      // If we know LHS/RHS share the same sign bit at each element we can
      // make this signed.
      // NOTE: `computeKnownBits` on a vector type aggregates common bits
      // across all lanes. So a pattern where the sign varies from lane to
      // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
      // missed. We could get around this by demanding each lane
      // independently, but this isn't the most important optimization and
      // that may eat into compile time.
      CanMakeSigned =
          CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
    }
    if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
      SDValue LHSOut = LHS;
      SDValue RHSOut = RHS;
      ISD::CondCode NewCC = CC;
      // GE/LE forms are only rewritten to strict GT/LT when one side is a
      // constant that incDecVectorConstant can adjust by one; otherwise NewCC
      // stays equal to CC and nothing is changed.
      switch (CC) {
      case ISD::SETGE:
      case ISD::SETUGE:
        if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
                                                  /*NSW*/ true))
          LHSOut = NewLHS;
        else if (SDValue NewRHS = incDecVectorConstant(
                     RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
          RHSOut = NewRHS;
        else
          break;

        [[fallthrough]];
      case ISD::SETUGT:
        NewCC = ISD::SETGT;
        break;

      case ISD::SETLE:
      case ISD::SETULE:
        if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
                                                  /*NSW*/ true))
          LHSOut = NewLHS;
        else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
                                                       /*NSW*/ true))
          RHSOut = NewRHS;
        else
          break;

        [[fallthrough]];
      case ISD::SETULT:
        // Will be swapped to SETGT in LowerVSETCC*.
        NewCC = ISD::SETLT;
        break;
      default:
        break;
      }
      if (NewCC != CC) {
        if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
                                                 NewCC, DL, DAG, Subtarget))
          return R;
        return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
      }
    }
  }

  if (SDValue R =
          truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
    return R;

  // In the middle end transforms:
  //    `(or (icmp eq X, C), (icmp eq X, C+1))`
  //        -> `(icmp ult (add x, -C), 2)`
  // Likewise inverted cases with `ugt`.
  //
  // Since x86, pre avx512, doesn't have unsigned vector compares, this results
  // in worse codegen. So, undo the middle-end transform and go back to `(or
  // (icmp eq), (icmp eq))` form.
  // Also skip AVX1 with ymm vectors, as the umin approach combines better than
  // the xmm approach.
  //
  // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
  // ne))` as it doesn't end up instruction positive.
  // TODO: We might want to do this for avx512 as well if we `sext` the result.
  if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
      ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
      !Subtarget.hasAVX512() &&
      (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
       Subtarget.hasAVX2()) &&
      LHS.hasOneUse()) {

    APInt CmpC;
    SDValue AddC = LHS.getOperand(1);
    if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
        DAG.isConstantIntBuildVectorOrConstantInt(AddC)) {
      // See which form we have depending on the constant/condition.
      SDValue C0 = SDValue();
      SDValue C1 = SDValue();

      // If we had `(add x, -1)` and can lower with `umin`, don't transform as
      // we will end up generating an additional constant. Keeping in the
      // current form has a slight latency cost, but it is probably worth saving
      // a constant.
      if (ISD::isConstantSplatVectorAllOnes(AddC.getNode()) &&
          DAG.getTargetLoweringInfo().isOperationLegal(ISD::UMIN, OpVT)) {
        // Pass
      }
      // Normal Cases
      // (x + AddC) is 0 or 1, i.e. x == -AddC or x == -AddC + 1.
      else if ((CC == ISD::SETULT && CmpC == 2) ||
               (CC == ISD::SETULE && CmpC == 1)) {
        // These will constant fold.
        C0 = DAG.getNegative(AddC, DL, OpVT);
        C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
                         DAG.getAllOnesConstant(DL, OpVT));
      }
      // Inverted Cases
      // (x + AddC) is -1 or -2, i.e. x == ~AddC or x == ~AddC - 1.
      else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
               (CC == ISD::SETUGE && (-CmpC) == 2)) {
        // These will constant fold.
        C0 = DAG.getNOT(DL, AddC, OpVT);
        C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
                         DAG.getAllOnesConstant(DL, OpVT));
      }
      if (C0 && C1) {
        SDValue NewLHS =
            DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
        SDValue NewRHS =
            DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
        return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
      }
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  // X pred 0.0 --> X pred -X
  // If the negation of X already exists, use it in the comparison. This removes
  // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
  // instructions in patterns with a 'select' node.
  if (isNullFPScalarOrVectorConst(RHS)) {
    SDVTList FNegVT = DAG.getVTList(OpVT);
    if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
      return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
  }

  return SDValue();
}

// Combine for X86ISD::MOVMSK. The result type is asserted to be i32 and the
// source vector to have at most as many elements as the result has bits. Bit
// Idx of the result corresponds to the sign bit of source element Idx.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = N->getSimpleValueType(0);
  unsigned NumBits = VT.getScalarSizeInBits();
  unsigned NumElts = SrcVT.getVectorNumElements();
  unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
  assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");

  // Perform constant folding.
  // Undef elements contribute a zero bit.
  APInt UndefElts;
  SmallVector EltBits;
  if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
                                    /*AllowWholeUndefs*/ true,
                                    /*AllowPartialUndefs*/ true)) {
    APInt Imm(32, 0);
    for (unsigned Idx = 0; Idx != NumElts; ++Idx)
      if (!UndefElts[Idx] && EltBits[Idx].isNegative())
        Imm.setBit(Idx);

    return DAG.getConstant(Imm, SDLoc(N), VT);
  }

  // Look through int->fp bitcasts that don't change the element width.
  unsigned EltWidth = SrcVT.getScalarSizeInBits();
  if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
      Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
    return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));

  // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
  // with scalar comparisons.
  // Only the low NumElts bits of the result correspond to lanes, so the scalar
  // NOT is an XOR with just those bits.
  if (SDValue NotSrc = IsNOT(Src, DAG)) {
    SDLoc DL(N);
    APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
    NotSrc = DAG.getBitcast(SrcVT, NotSrc);
    return DAG.getNode(ISD::XOR, DL, VT,
                       DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
                       DAG.getConstant(NotMask, DL, VT));
  }

  // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
  // results with scalar comparisons.
if (Src.getOpcode() == X86ISD::PCMPGT &&
      ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
    SDLoc DL(N);
    // x > -1 (signed) holds exactly when the sign bit of x is clear, so the
    // result is movmsk(x) with its low NumElts bits inverted.
    APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
    return DAG.getNode(ISD::XOR, DL, VT,
                       DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
                       DAG.getConstant(NotMask, DL, VT));
  }

  // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
  // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
  // iff pow2splat(c1).
  // Use KnownBits to determine if only a single bit is non-zero
  // in each element (pow2 or zero), and shift that bit to the msb.
  if (Src.getOpcode() == X86ISD::PCMPEQ) {
    KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
    KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
    unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
    // LHS has at most one possibly-set bit; RHS is either known zero or has
    // at most one possibly-set bit with the same leading-zero count.
    if (KnownLHS.countMaxPopulation() == 1 &&
        (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
                               ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
      SDLoc DL(N);
      MVT ShiftVT = SrcVT;
      SDValue ShiftLHS = Src.getOperand(0);
      SDValue ShiftRHS = Src.getOperand(1);
      if (ShiftVT.getScalarType() == MVT::i8) {
        // vXi8 shifts - we only care about the signbit so can use PSLLW.
        ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
        ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
        ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
      }
      ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
                                            ShiftLHS, ShiftAmt, DAG);
      ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
                                            ShiftRHS, ShiftAmt, DAG);
      ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
      ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
      // After the shift the elements are equal iff their sign bits agree,
      // i.e. iff the sign bit of the XOR is clear; the NOT turns that into a
      // set sign bit for MOVMSK.
      SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
      return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
    }
  }

  // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
  // The constant operand is reduced to a scalar mask holding the sign bit of
  // each of its elements (undef elements contribute zero).
  if (N->isOnlyUserOf(Src.getNode())) {
    SDValue SrcBC = peekThroughOneUseBitcasts(Src);
    if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
      APInt UndefElts;
      SmallVector EltBits;
      if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
                                        UndefElts, EltBits)) {
        APInt Mask = APInt::getZero(NumBits);
        for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
          if (!UndefElts[Idx] && EltBits[Idx].isNegative())
            Mask.setBit(Idx);
        }
        SDLoc DL(N);
        SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
        SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
        return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
                           DAG.getConstant(Mask, DL, VT));
      }
    }
  }

  // Simplify the inputs.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedMask(APInt::getAllOnes(NumBits));
  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

// Combine for a TESTP node: the only thing attempted is demanded-bits
// simplification of the node with every result bit demanded.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  unsigned NumBits = VT.getScalarSizeInBits();

  // Simplify the inputs.
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getAllOnes(NumBits)); if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { auto *MemOp = cast(N); SDValue Mask = MemOp->getMask(); // With vector masks we only demand the upper bit of the mask. if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); } } return SDValue(); } static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG) { SDLoc DL(GorS); if (auto *Gather = dyn_cast(GorS)) { SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(), Gather->getMask(), Base, Index, Scale } ; return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL, Ops, Gather->getMemOperand(), Gather->getIndexType(), Gather->getExtensionType()); } auto *Scatter = cast(GorS); SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), Scatter->getMask(), Base, Index, Scale }; return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL, Ops, Scatter->getMemOperand(), Scatter->getIndexType(), Scatter->isTruncatingStore()); } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto *GorS = cast(N); SDValue Index = GorS->getIndex(); SDValue Base = GorS->getBasePtr(); SDValue Scale = GorS->getScale(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (DCI.isBeforeLegalize()) { unsigned IndexWidth = Index.getScalarValueSizeInBits(); // Shrink constant indices if they are larger than 32-bits. 
// Only do this before legalize types since v2i64 could become v2i32.
    // FIXME: We could check that the type is legal if we're after legalize
    // types, but then we would need to construct test cases where that happens.
    // FIXME: We could support more than just constant vectors, but we need to
    // be careful with costing. A truncate that can be optimized out would be
    // fine. Otherwise we might only want to create a truncate if it avoids a
    // split.
    // NOTE(review): the dyn_cast/isa calls in this function carry no template
    // argument in this copy of the file - presumably lost in extraction;
    // confirm against upstream.
    if (auto *BV = dyn_cast(Index)) {
      // More than (IndexWidth - 32) sign bits means every element fits in a
      // signed 32-bit value, so the truncate loses nothing.
      if (BV->isConstant() && IndexWidth > 32 &&
          DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
        EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
        Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
      }
    }

    // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
    // there are sufficient sign bits. Only do this before legalize types to
    // avoid creating illegal types in truncate.
    if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
         Index.getOpcode() == ISD::ZERO_EXTEND) &&
        IndexWidth > 32 &&
        Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
        DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
      EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
      Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
    }
  }

  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // Try to move splat constant adders from the index operand to the base
  // pointer operand. Taking care to multiply by the scale. We can only do
  // this when index element type is the same as the pointer type.
  // Otherwise we need to be sure the math doesn't wrap before the scale.
  if (Index.getOpcode() == ISD::ADD &&
      Index.getValueType().getVectorElementType() == PtrVT &&
      isa(Scale)) {
    uint64_t ScaleAmt = Scale->getAsZExtVal();
    if (auto *BV = dyn_cast(Index.getOperand(1))) {
      BitVector UndefElts;
      if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
        // FIXME: Allow non-constant?
        // Only a splat without undef elements is folded into the base.
        if (UndefElts.none()) {
          // Apply the scale.
          APInt Adder = C->getAPIntValue() * ScaleAmt;
          // Add it to the existing base.
          Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
                             DAG.getConstant(Adder, DL, PtrVT));
          Index = Index.getOperand(0);
          return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
        }
      }

      // It's also possible base is just a constant. In that case, just
      // replace it with 0 and move the displacement into the index.
      // This is only done when the scale is 1.
      if (BV->isConstant() && isa(Base) &&
          isOneConstant(Scale)) {
        SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
        // Combine the constant build_vector and the constant base.
        Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
                            Index.getOperand(1), Splat);
        // Add to the LHS of the original Index add.
        Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
                            Index.getOperand(0), Splat);
        Base = DAG.getConstant(0, DL, Base.getValueType());
        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
      }
    }
  }

  if (DCI.isBeforeLegalizeOps()) {
    unsigned IndexWidth = Index.getScalarValueSizeInBits();

    // Make sure the index is either i32 or i64
    // Narrower-than-32 indices become i32; every other width becomes i64.
    if (IndexWidth != 32 && IndexWidth != 64) {
      MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
      EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
      return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
    }
  }

  // With vector masks we only demand the upper bit of the mask.
SDValue Mask = GorS->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { if (N->getOpcode() != ISD::DELETED_NODE) DCI.AddToWorklist(N); return SDValue(N, 0); } } return SDValue(); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); // Try to simplify the EFLAGS and condition code operands. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) return getSETCC(CC, Flags, DL, DAG); return SDValue(); } /// Optimize branch condition evaluation. static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); // Try to simplify the EFLAGS and condition code operands. // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); } return SDValue(); } // TODO: Could we move this to DAGCombine? static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane // to optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. 
EVT VT = N->getValueType(0);
  bool IsStrict = N->isStrictFPOpcode();
  unsigned NumEltBits = VT.getScalarSizeInBits();
  // Strict FP nodes carry the chain as operand 0, so the value is operand 1.
  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
  // The first AND operand must have every bit equal to its sign bit, i.e. each
  // lane is all-zeros or all-ones.
  if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
      DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
      VT.getSizeInBits() != Op0.getValueSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  // NOTE(review): the dyn_cast below carries no template argument in this copy
  // of the file - presumably lost in extraction; confirm against upstream.
  if (auto *BV = dyn_cast(Op0.getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst;
    if (IsStrict)
      SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
                                {N->getOperand(0), SDValue(BV, 0)});
    else
      SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
                                 MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    // For the strict form also return the chain produced by the converted
    // constant.
    if (IsStrict)
      return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
    return Res;
  }

  return SDValue();
}

/// If we are converting a value to floating-point, try to replace scalar
/// truncate of an extracted vector element with a bitcast. This tries to keep
/// the sequence on XMM registers rather than moving between vector and GPRs.
///
/// Only fires when the truncate and the extract each have a single use, the
/// extract reads element 0, and the extracted width is a multiple of the
/// truncated width. Returns the rebuilt cast or an empty SDValue.
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
  // TODO: This is currently only used by combineSIntToFP, but it is generalized
  //       to allow being called by any similar cast opcode.
  // TODO: Consider merging this into lowering: vectorizeExtractedCast().
  SDValue Trunc = N->getOperand(0);
  if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue ExtElt = Trunc.getOperand(0);
  if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isNullConstant(ExtElt.getOperand(1)))
    return SDValue();

  EVT TruncVT = Trunc.getValueType();
  EVT SrcVT = ExtElt.getValueType();
  unsigned DestWidth = TruncVT.getSizeInBits();
  unsigned SrcWidth = SrcVT.getSizeInBits();
  if (SrcWidth % DestWidth != 0)
    return SDValue();

  // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
  // The source vector is reinterpreted as a vector of the narrower element
  // type with the same total width.
  EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
  unsigned VecWidth = SrcVecVT.getSizeInBits();
  unsigned NumElts = VecWidth / DestWidth;
  EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
  SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
  SDLoc DL(N);
  SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
                                  BitcastVec, ExtElt.getOperand(1));
  return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
}

// Combine for an unsigned int-to-FP conversion (plain or strict): widens
// narrow vector sources with a zero-extend and converts them as signed, and
// switches to a signed conversion when the source is known non-negative.
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  bool IsStrict = N->isStrictFPOpcode();
  // Strict FP nodes carry the chain as operand 0, so the value is operand 1.
  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();

  // Using i16 as an intermediate type is a bad idea, unless we have HW support
  // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
// if hasFP16 support:
  //   UINT_TO_FP(vXi1~15)  -> SINT_TO_FP(ZEXT(vXi1~15  to vXi16))
  //   UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
  // else
  //   UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
  //   UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
  // The zero-extended value is non-negative in the wider type, so the signed
  // conversion gives the same result.
  if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
    unsigned ScalarSize = InVT.getScalarSizeInBits();
    // Leave i16 (with FP16 support), i32 and i64-or-wider sources alone.
    if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
        ScalarSize >= 64)
      return SDValue();
    SDLoc dl(N);
    EVT DstVT =
        EVT::getVectorVT(*DAG.getContext(),
                         (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
                         : ScalarSize < 32                        ? MVT::i32
                                                                  : MVT::i64,
                         InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
      VT.getScalarType() != MVT::f16) {
    SDLoc dl(N);
    EVT DstVT = InVT.changeVectorElementType(MVT::i32);
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  SDNodeFlags Flags = N->getFlags();
  if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
                         {N->getOperand(0), Op0});
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
  }

  return SDValue();
}

// Combine for a signed int-to-FP conversion (plain or strict). Tries, in
// order: folding a conversion of a compare-masked constant, widening narrow
// vector sources with a sign-extend, narrowing wide sources that are known to
// fit in 32 bits, the x87 FILD path for an i64 load on 32-bit targets, and
// (non-strict only) the truncated-extract rewrite.
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  bool IsStrict = N->isStrictFPOpcode();
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  // Strict FP nodes carry the chain as operand 0, so the value is operand 1.
  SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();

  // Using i16 as an intermediate type is a bad idea, unless we have HW support
  // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
  // if hasFP16 support:
  //   SINT_TO_FP(vXi1~15)  -> SINT_TO_FP(SEXT(vXi1~15  to vXi16))
  //   SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
  // else
  //   SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
  //   SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
  if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
    unsigned ScalarSize = InVT.getScalarSizeInBits();
    // Leave i16 (with FP16 support), i32 and i64-or-wider sources alone.
    if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
        ScalarSize >= 64)
      return SDValue();
    SDLoc dl(N);
    EVT DstVT =
        EVT::getVectorVT(*DAG.getContext(),
                         (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
                         : ScalarSize < 32                        ? MVT::i32
                                                                  : MVT::i64,
                         InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
      VT.getScalarType() != MVT::f16) {
    SDLoc dl(N);
    EVT DstVT = InVT.changeVectorElementType(MVT::i32);
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    if (IsStrict)
      return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                         {N->getOperand(0), P});
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    // At least (BitWidth - 31) sign bits means the value fits in a signed
    // 32-bit integer.
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = MVT::i32;
      if (InVT.isVector())
        TruncVT = InVT.changeVectorElementType(TruncVT);
      SDLoc dl(N);
      if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
        if (IsStrict)
          return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
                             {N->getOperand(0), Trunc});
        return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
      }
      // If we're after legalize and the type is v2i32 we need to shuffle and
      // use CVTSI2P.
      // Shuffle elements 0 and 2 of the v4i32 view of the v2i64 input into the
      // two low lanes.
      assert(InVT == MVT::v2i64 && "Unexpected VT!");
      SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
                                          { 0, 2, -1, -1 });
      if (IsStrict)
        return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
                           {N->getOperand(0), Shuf});
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  // NOTE(review): the cast and the std::pair below carry no template
  // arguments in this copy of the file - presumably lost in extraction;
  // confirm against upstream.
  if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
      Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast(Op0.getNode());

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    // If we have AVX512DQ we can use packed conversion instructions unless
    // the VT is f80.
    if (Subtarget.hasDQI() && VT != MVT::f80)
      return SDValue();

    if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
        Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
      std::pair Tmp =
          Subtarget.getTargetLowering()->BuildFILD(
              VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
              Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
      // Redirect users of the load's chain result to the second value
      // returned by BuildFILD.
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
      return Tmp.first;
    }
  }

  // The remaining fold is not attempted for strict FP nodes.
  if (IsStrict)
    return SDValue();

  if (SDValue V = combineToFPTruncExtElt(N, DAG))
    return V;

  return SDValue();
}

// Return true if any user of Flags might read the carry or overflow flag:
// either the user is not one of the node kinds handled below, or its condition
// code is one of the above/below/overflow/greater/less conditions.
static bool needCarryOrOverflowFlag(SDValue Flags) {
  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");

  for (const SDNode *User : Flags->uses()) {
    X86::CondCode CC;
    switch (User->getOpcode()) {
    default:
      // Be conservative.
return true; case X86ISD::SETCC: case X86ISD::SETCC_CARRY: CC = (X86::CondCode)User->getConstantOperandVal(0); break; case X86ISD::BRCOND: case X86ISD::CMOV: CC = (X86::CondCode)User->getConstantOperandVal(2); break; } switch (CC) { // clang-format off default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_O: case X86::COND_NO: case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return true; // clang-format on } } return false; } static bool onlyZeroFlagUsed(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); for (const SDNode *User : Flags->uses()) { unsigned CCOpNo; switch (User->getOpcode()) { default: // Be conservative. return false; case X86ISD::SETCC: case X86ISD::SETCC_CARRY: CCOpNo = 0; break; case X86ISD::BRCOND: case X86ISD::CMOV: CCOpNo = 2; break; } X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo); if (CC != X86::COND_E && CC != X86::COND_NE) return false; } return true; } static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // Only handle test patterns. if (!isNullConstant(N->getOperand(1))) return SDValue(); // If we have a CMP of a truncated binop, see if we can make a smaller binop // and use its flags directly. // TODO: Maybe we should try promoting compares that only use the zero flag // first if we can prove the upper bits with computeKnownBits? SDLoc dl(N); SDValue Op = N->getOperand(0); EVT VT = Op.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget)) return CMP; // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. 
if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) && Op.hasOneUse() && isa(Op.getOperand(1)) && onlyZeroFlagUsed(SDValue(N, 0))) { unsigned BitWidth = VT.getSizeInBits(); const APInt &ShAmt = Op.getConstantOperandAPInt(1); if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts. unsigned MaskBits = BitWidth - ShAmt.getZExtValue(); APInt Mask = Op.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, MaskBits) : APInt::getLowBitsSet(BitWidth, MaskBits); if (Mask.isSignedIntN(32)) { Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), DAG.getConstant(Mask, dl, VT)); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, VT)); } } } // If we're extracting from a avx512 bool vector and comparing against zero, // then try to just bitcast the vector to an integer to use TEST/BT directly. // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<= 8 && TLI.isTypeLegal(SrcVT)) return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src, DAG.getConstant(0, dl, SrcVT)); } // Look for a truncate. if (Op.getOpcode() != ISD::TRUNCATE) return SDValue(); SDValue Trunc = Op; Op = Op.getOperand(0); // See if we can compare with zero against the truncation source, // which should help using the Z flag from many ops. Only do this for // i32 truncated op to prevent partial-reg compares of promoted ops. EVT OpVT = Op.getValueType(); APInt UpperBits = APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits()); if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) && onlyZeroFlagUsed(SDValue(N, 0))) { return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, OpVT)); } // After this the truncate and arithmetic op must have a single use. if (!Trunc.hasOneUse() || !Op.hasOneUse()) return SDValue(); unsigned NewOpc; switch (Op.getOpcode()) { default: return SDValue(); case ISD::AND: // Skip and with constant. We have special handling for and with immediate // during isel to generate test instructions. 
if (isa(Op.getOperand(1))) return SDValue(); NewOpc = X86ISD::AND; break; case ISD::OR: NewOpc = X86ISD::OR; break; case ISD::XOR: NewOpc = X86ISD::XOR; break; case ISD::ADD: // If the carry or overflow flag is used, we can't truncate. if (needCarryOrOverflowFlag(SDValue(N, 0))) return SDValue(); NewOpc = X86ISD::ADD; break; case ISD::SUB: // If the carry or overflow flag is used, we can't truncate. if (needCarryOrOverflowFlag(SDValue(N, 0))) return SDValue(); NewOpc = X86ISD::SUB; break; } // We found an op we can narrow. Truncate its inputs. SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0)); SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1)); // Use a X86 specific opcode to avoid DAG combine messing with it. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1); // For AND, keep a CMP so that we can match the test pattern. if (NewOpc == X86ISD::AND) return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, VT)); // Return the flags. return Op.getValue(1); } static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST) { assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) && "Expected X86ISD::ADD or X86ISD::SUB"); SDLoc DL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); MVT VT = LHS.getSimpleValueType(); bool IsSub = X86ISD::SUB == N->getOpcode(); unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD; if (IsSub && isOneConstant(N->getOperand(1)) && !N->hasAnyUseOfValue(0)) if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST)) return CMP; // If we don't use the flag result, simplify back to a generic ADD/SUB. if (!N->hasAnyUseOfValue(1)) { SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS); return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL); } // Fold any similar generic ADD/SUB opcodes to reuse this node. 
auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { SDValue Ops[] = {N0, N1}; SDVTList VTs = DAG.getVTList(N->getValueType(0)); if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { SDValue Op(N, 0); if (Negate) Op = DAG.getNegative(Op, DL, VT); DCI.CombineTo(GenericAddSub, Op); } }; MatchGeneric(LHS, RHS, false); MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the // EFLAGS result doesn't change. return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG, /*ZeroSecondOpOnly*/ true); } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue BorrowIn = N->getOperand(2); if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags); } // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) // iff the flag result is dead. if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) && !N->hasAnyUseOfValue(1)) return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0), LHS.getOperand(1), BorrowIn); return SDValue(); } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue CarryIn = N->getOperand(2); auto *LHSC = dyn_cast(LHS); auto *RHSC = dyn_cast(RHS); // Canonicalize constant to RHS. if (LHSC && !RHSC) return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS, CarryIn); // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. 
if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); SDValue Res1 = DAG.getNode( ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn), DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry) // iff the flag result is dead. // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow. if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) { SDLoc DL(N); APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue(); return DAG.getNode(X86ISD::ADC, DL, N->getVTList(), DAG.getConstant(0, DL, LHS.getValueType()), DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn); } if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags); } // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry) // iff the flag result is dead. 
if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() && !N->hasAnyUseOfValue(1)) return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0), LHS.getOperand(1), CarryIn); return SDValue(); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { // Example of pattern we try to detect: // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1)))) //(add (build_vector (extract_elt t, 0), // (extract_elt t, 2), // (extract_elt t, 4), // (extract_elt t, 6)), // (build_vector (extract_elt t, 1), // (extract_elt t, 3), // (extract_elt t, 5), // (extract_elt t, 7))) if (!Subtarget.hasSSE2()) return SDValue(); if (Op0.getOpcode() != ISD::BUILD_VECTOR || Op1.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || VT.getVectorNumElements() < 4 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Check if one of Op0,Op1 is of the form: // (build_vector (extract_elt Mul, 0), // (extract_elt Mul, 2), // (extract_elt Mul, 4), // ... // the other is of the form: // (build_vector (extract_elt Mul, 1), // (extract_elt Mul, 3), // (extract_elt Mul, 5), // ... // and identify Mul. SDValue Mul; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) { SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i), Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1); // TODO: Be more tolerant to undefs. 
if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *Const0L = dyn_cast(Op0L->getOperand(1)); auto *Const1L = dyn_cast(Op1L->getOperand(1)); auto *Const0H = dyn_cast(Op0H->getOperand(1)); auto *Const1H = dyn_cast(Op1H->getOperand(1)); if (!Const0L || !Const1L || !Const0H || !Const1H) return SDValue(); unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(), Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue(); // Commutativity of mul allows factors of a product to reorder. if (Idx0L > Idx1L) std::swap(Idx0L, Idx1L); if (Idx0H > Idx1H) std::swap(Idx0H, Idx1H); // Commutativity of add allows pairs of factors to reorder. if (Idx0L > Idx0H) { std::swap(Idx0L, Idx0H); std::swap(Idx1L, Idx1H); } if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 || Idx1H != 2 * i + 3) return SDValue(); if (!Mul) { // First time an extract_elt's source vector is visited. Must be a MUL // with 2X number of vector elements than the BUILD_VECTOR. // Both extracts must be from same MUL. Mul = Op0L->getOperand(0); if (Mul->getOpcode() != ISD::MUL || Mul.getValueType().getVectorNumElements() != 2 * e) return SDValue(); } // Check that the extract is from the same MUL previously seen. if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) || Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0)) return SDValue(); } // Check if the Mul source can be safely shrunk. 
ShrinkMode Mode; if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == ShrinkMode::MULU16) return SDValue(); EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements() * 2); SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0)); SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1)); auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { EVT InVT = Ops[0].getValueType(); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder); } // Attempt to turn this pattern into PMADDWD. // (add (mul (sext (build_vector)), (sext (build_vector))), // (mul (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) return SDValue(); if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || VT.getVectorNumElements() < 4 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); // All inputs need to be sign extends. // TODO: Support ZERO_EXTEND from known positive? if (N00.getOpcode() != ISD::SIGN_EXTEND || N01.getOpcode() != ISD::SIGN_EXTEND || N10.getOpcode() != ISD::SIGN_EXTEND || N11.getOpcode() != ISD::SIGN_EXTEND) return SDValue(); // Peek through the extends. N00 = N00.getOperand(0); N01 = N01.getOperand(0); N10 = N10.getOperand(0); N11 = N11.getOperand(0); // Must be extending from vXi16. 
EVT InVT = N00.getValueType(); if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT || N10.getValueType() != InVT || N11.getValueType() != InVT) return SDValue(); // All inputs should be build_vectors. if (N00.getOpcode() != ISD::BUILD_VECTOR || N01.getOpcode() != ISD::BUILD_VECTOR || N10.getOpcode() != ISD::BUILD_VECTOR || N11.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // For each element, we need to ensure we have an odd element from one vector // multiplied by the odd element of another vector and the even element from // one of the same vectors being multiplied by the even element from the // other vector. So we need to make sure for each element i, this operator // is being performed: // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] SDValue In0, In1; for (unsigned i = 0; i != N00.getNumOperands(); ++i) { SDValue N00Elt = N00.getOperand(i); SDValue N01Elt = N01.getOperand(i); SDValue N10Elt = N10.getOperand(i); SDValue N11Elt = N11.getOperand(i); // TODO: Be more tolerant to undefs. if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *ConstN00Elt = dyn_cast(N00Elt.getOperand(1)); auto *ConstN01Elt = dyn_cast(N01Elt.getOperand(1)); auto *ConstN10Elt = dyn_cast(N10Elt.getOperand(1)); auto *ConstN11Elt = dyn_cast(N11Elt.getOperand(1)); if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) return SDValue(); unsigned IdxN00 = ConstN00Elt->getZExtValue(); unsigned IdxN01 = ConstN01Elt->getZExtValue(); unsigned IdxN10 = ConstN10Elt->getZExtValue(); unsigned IdxN11 = ConstN11Elt->getZExtValue(); // Add is commutative so indices can be reordered. if (IdxN00 > IdxN10) { std::swap(IdxN00, IdxN10); std::swap(IdxN01, IdxN11); } // N0 indices be the even element. N1 indices must be the next odd element. 
if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || IdxN11 != 2 * i + 1) return SDValue(); SDValue N00In = N00Elt.getOperand(0); SDValue N01In = N01Elt.getOperand(0); SDValue N10In = N10Elt.getOperand(0); SDValue N11In = N11Elt.getOperand(0); // First time we find an input capture it. if (!In0) { In0 = N00In; In1 = N01In; // The input vectors must be at least as wide as the output. // If they are larger than the output, we extract subvector below. if (In0.getValueSizeInBits() < VT.getSizeInBits() || In1.getValueSizeInBits() < VT.getSizeInBits()) return SDValue(); } // Mul is commutative so the input vectors can be in any order. // Canonicalize to make the compares easier. if (In0 != N00In) std::swap(N00In, N01In); if (In0 != N10In) std::swap(N10In, N11In); if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In) return SDValue(); } auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { EVT OpVT = Ops[0].getValueType(); assert(OpVT.getScalarType() == MVT::i16 && "Unexpected scalar element type"); assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, OpVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; // If the output is narrower than an input, extract the low part of the input // vector. 
EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements() * 2); if (OutVT16.bitsLT(In0.getValueType())) { In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0, DAG.getIntPtrConstant(0, DL)); } if (OutVT16.bitsLT(In1.getValueType())) { In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1, DAG.getIntPtrConstant(0, DL)); } return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, PMADDBuilder); } // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W)) // If upper element in each pair of both VPMADDWD are zero then we can merge // the operand elements and use the implicit add of VPMADDWD. // TODO: Add support for VPMADDUBSW (which isn't commutable). static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT) { if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD) return SDValue(); // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles. if (VT.getSizeInBits() > 128) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); MVT OpVT = N0.getOperand(0).getSimpleValueType(); APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits()); APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2)); bool Op0HiZero = DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) || DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts); bool Op1HiZero = DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) || DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts); // TODO: Check for zero lower elements once we have actual codegen that // creates them. if (!Op0HiZero || !Op1HiZero) return SDValue(); // Create a shuffle mask packing the lower elements from each VPMADDWD. 
SmallVector Mask; for (int i = 0; i != (int)NumElts; ++i) { Mask.push_back(2 * i); Mask.push_back(2 * (i + NumElts)); } SDValue LHS = DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask); SDValue RHS = DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask); return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS); } /// CMOV of constants requires materializing constant operands in registers. /// Try to fold those constants into an 'add' instruction to reduce instruction /// count. We do this with CMOV rather the generic 'select' because there are /// earlier folds that may be used to turn select-of-constants into logic hacks. static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If an operand is zero, add-of-0 gets simplified away, so that's clearly // better because we eliminate 1-2 instructions. This transform is still // an improvement without zero operands because we trade 2 move constants and // 1 add for 2 adds (LEA) as long as the constants can be represented as // immediate asm operands (fit in 32-bits). auto isSuitableCmov = [](SDValue V) { if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse()) return false; if (!isa(V.getOperand(0)) || !isa(V.getOperand(1))) return false; return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) || (V.getConstantOperandAPInt(0).isSignedIntN(32) && V.getConstantOperandAPInt(1).isSignedIntN(32)); }; // Match an appropriate CMOV as the first operand of the add. SDValue Cmov = N->getOperand(0); SDValue OtherOp = N->getOperand(1); if (!isSuitableCmov(Cmov)) std::swap(Cmov, OtherOp); if (!isSuitableCmov(Cmov)) return SDValue(); // Don't remove a load folding opportunity for the add. That would neutralize // any improvements from removing constant materializations. 
if (X86::mayFoldLoad(OtherOp, Subtarget)) return SDValue(); EVT VT = N->getValueType(0); SDValue FalseOp = Cmov.getOperand(0); SDValue TrueOp = Cmov.getOperand(1); // We will push the add through the select, but we can potentially do better // if we know there is another add in the sequence and this is pointer math. // In that case, we can absorb an add into the trailing memory op and avoid // a 3-operand LEA which is likely slower than a 2-operand LEA. // TODO: If target has "slow3OpsLEA", do this even without the trailing memop? if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() && !isa(OtherOp.getOperand(0)) && all_of(N->uses(), [&](SDNode *Use) { auto *MemNode = dyn_cast(Use); return MemNode && MemNode->getBasePtr().getNode() == N; })) { // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but // it is possible that choosing op1 might be better. SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1); FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp); TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp); Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2), Cmov.getOperand(3)); return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y); } // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2) FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp); TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp); return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2), Cmov.getOperand(3)); } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDLoc DL(N); if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget)) return Select; if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget)) return MAdd; if (SDValue MAdd = 
matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget)) return MAdd; if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT)) return MAdd; // Try to synthesize horizontal adds from adds of shuffles. if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget)) return V; // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0) // iff X and Y won't overflow. if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW && ISD::isBuildVectorAllZeros(Op0.getOperand(1).getNode()) && ISD::isBuildVectorAllZeros(Op1.getOperand(1).getNode())) { if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) { MVT OpVT = Op0.getOperand(1).getSimpleValueType(); SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0)); return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum, getZeroVector(OpVT, Subtarget, DAG, DL)); } } // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into // (sub Y, (sext (vXi1 X))). // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in // generic DAG combine without a legal type check, but adding this there // caused regressions. 
if (VT.isVector()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Op0.getOpcode() == ISD::ZERO_EXTEND && Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && TLI.isTypeLegal(Op0.getOperand(0).getValueType())) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt); } if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && TLI.isTypeLegal(Op1.getOperand(0).getValueType())) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt); } } // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W) if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() && X86::isZeroNode(Op0.getOperand(1))) { assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use"); return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1, Op0.getOperand(0), Op0.getOperand(2)); } return combineAddOrSubToADCOrSBB(N, DL, DAG); } // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov // condition comes from the subtract node that produced -X. This matches the // cmov expansion for absolute value. By swapping the operands we convert abs // to nabs. static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse()) return SDValue(); X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2); if (CC != X86::COND_S && CC != X86::COND_NS) return SDValue(); // Condition should come from a negate operation. SDValue Cond = N1.getOperand(3); if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0))) return SDValue(); assert(Cond.getResNo() == 1 && "Unexpected result number"); // Get the X and -X from the negate. 
SDValue NegX = Cond.getValue(0); SDValue X = Cond.getOperand(1); SDValue FalseOp = N1.getOperand(0); SDValue TrueOp = N1.getOperand(1); // Cmov operands should be X and NegX. Order doesn't matter. if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X)) return SDValue(); // Build a new CMOV with the operands swapped. SDLoc DL(N); MVT VT = N->getSimpleValueType(0); SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2), Cond); // Convert sub to add. return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov); } static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // (sub C (zero_extend (setcc))) // => // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate // Don't disturb (sub 0 setcc), which is easily done with neg. EVT VT = N->getValueType(0); auto *Op0C = dyn_cast(Op0); if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C && !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC && Op1.getOperand(0).hasOneUse()) { SDValue SetCC = Op1.getOperand(0); X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC); APInt NewImm = Op0C->getAPIntValue() - 1; SDLoc DL(Op1); SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG); NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC); return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC, DAG.getConstant(NewImm, DL, VT)); } return SDValue(); } static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) { // res, flags2 = sub 0, (setcc cc, flag) // cload/cstore ..., cond_ne, flag2 // -> // cload/cstore cc, flag if (N->getConstantOperandVal(3) != X86::COND_NE) return SDValue(); SDValue Sub = N->getOperand(4); if (Sub.getOpcode() != X86ISD::SUB) return SDValue(); SDValue SetCC = Sub.getOperand(1); if (!X86::isZeroNode(Sub.getOperand(0)) || SetCC.getOpcode() != X86ISD::SETCC) return 
SDValue(); SmallVector Ops(N->op_values()); Ops[3] = SetCC.getOperand(0); Ops[4] = SetCC.getOperand(1); return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops, cast(N)->getMemoryVT(), cast(N)->getMemOperand()); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDLoc DL(N); // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt. auto IsNonOpaqueConstant = [&](SDValue Op) { if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) { if (auto *Cst = dyn_cast(C)) return !Cst->isOpaque(); return true; } return false; }; // X86 can't encode an immediate LHS of a sub. See if we can push the // negation into a preceding instruction. If the RHS of the sub is a XOR with // one use and a constant, invert the immediate, saving one register. // However, ignore cases where C1 is 0, as those will become a NEG. // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1) if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) && !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) { EVT VT = Op0.getValueType(); SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT)); SDValue NewAdd = DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT)); return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd); } if (SDValue V = combineSubABS(N, DAG)) return V; // Try to synthesize horizontal subs from subs of shuffles. 
if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget)) return V; // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W) if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() && X86::isZeroNode(Op1.getOperand(1))) { assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0, Op1.getOperand(0), Op1.getOperand(2)); } // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y) // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds. if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() && !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) { assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0, Op1.getOperand(1), Op1.getOperand(2)); return DAG.getNode(ISD::SUB, DL, Op0.getValueType(), ADC.getValue(0), Op1.getOperand(0)); } if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget)) return V; if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG)) return V; return combineSubSetcc(N, DAG); } static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) && "Unknown PCMP opcode"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); MVT VT = N->getSimpleValueType(0); unsigned EltBits = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); SDLoc DL(N); if (LHS == RHS) return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT) : DAG.getConstant(0, DL, VT); // Constant Folding. 
// PCMPEQ(X,UNDEF) -> UNDEF // PCMPGT(X,UNDEF) -> 0 // PCMPGT(UNDEF,X) -> 0 APInt LHSUndefs, RHSUndefs; SmallVector LHSBits, RHSBits; if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) && getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) { APInt Ones = APInt::getAllOnes(EltBits); APInt Zero = APInt::getZero(EltBits); SmallVector Results(NumElts); for (unsigned I = 0; I != NumElts; ++I) { if (Opcode == X86ISD::PCMPEQ) { Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero; } else { bool AnyUndef = LHSUndefs[I] || RHSUndefs[I]; Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero; } } if (Opcode == X86ISD::PCMPEQ) return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL); return getConstVector(Results, VT, DAG, DL); } return SDValue(); } // Helper to determine if we can convert an integer comparison to a float // comparison byt casting the operands. static std::optional CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS) { MVT SVT = VT.getScalarType(); assert(SVT == MVT::f32 && "Only tested for float so far"); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(SVT); assert((CC == ISD::SETEQ || CC == ISD::SETGT) && "Only PCMPEQ/PCMPGT currently supported"); // TODO: Handle bitcastable integers. // For cvt + signed compare we need lhs and rhs to be exactly representable as // a fp value. unsigned FPPrec = APFloat::semanticsPrecision(Sem); if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS) return ISD::SINT_TO_FP; return std::nullopt; } /// Helper that combines an array of subvector ops as if they were the operands /// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g. /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type. 
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  // concat(undef, undef, ...) -> undef
  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
    return DAG.getUNDEF(VT);

  // concat(zero, zero, ...) -> zero
  if (llvm::all_of(Ops, [](SDValue Op) {
        return ISD::isBuildVectorAllZeros(Op.getNode());
      }))
    return getZeroVector(VT, Subtarget, DAG, DL);

  SDValue Op0 = Ops[0];
  bool IsSplat = llvm::all_equal(Ops);
  unsigned NumOps = Ops.size();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  LLVMContext &Ctx = *DAG.getContext();

  // Repeated subvectors.
  if (IsSplat &&
      (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
    // If this broadcast is inserted into both halves, use a larger broadcast.
    if (Op0.getOpcode() == X86ISD::VBROADCAST)
      return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));

    // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
    if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
        (Subtarget.hasAVX2() ||
         X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
                                              VT.getScalarType(), Subtarget)))
      return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
                                     Op0.getOperand(0),
                                     DAG.getIntPtrConstant(0, DL)));

    // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
    if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Subtarget.hasAVX2() ||
         (EltSizeInBits >= 32 &&
          X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
        Op0.getOperand(0).getValueType() == VT.getScalarType())
      return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));

    // concat_vectors(extract_subvector(broadcast(x)),
    //                extract_subvector(broadcast(x))) -> broadcast(x)
    // concat_vectors(extract_subvector(subv_broadcast(x)),
    //                extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
    if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Op0.getOperand(0).getValueType() == VT) {
      SDValue SrcVec = Op0.getOperand(0);
      if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
          SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
        return Op0.getOperand(0);
      if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
          Op0.getValueType() ==
              cast<MemIntrinsicSDNode>(SrcVec)->getMemoryVT())
        return Op0.getOperand(0);
    }

    // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
    if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
        !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
      return DAG.getNode(Op0.getOpcode(), DL, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
                                     Op0.getOperand(0), Op0.getOperand(0)),
                         Op0.getOperand(1));
  }

  // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
  // Only concat of subvector high halves which vperm2x128 is best at.
  // TODO: This should go in combineX86ShufflesRecursively eventually.
  if (VT.is256BitVector() && NumOps == 2) {
    SDValue Src0 = peekThroughBitcasts(Ops[0]);
    SDValue Src1 = peekThroughBitcasts(Ops[1]);
    if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      EVT SrcVT0 = Src0.getOperand(0).getValueType();
      EVT SrcVT1 = Src1.getOperand(0).getValueType();
      unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
      unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
      if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
          Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
          Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
        return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
                           DAG.getBitcast(VT, Src0.getOperand(0)),
                           DAG.getBitcast(VT, Src1.getOperand(0)),
                           DAG.getTargetConstant(0x31, DL, MVT::i8));
      }
    }
  }

  // Repeated opcode.
  // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
  // but it currently struggles with different vector widths.
  if (llvm::all_of(Ops, [Op0](SDValue Op) {
        return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
      })) {
    // Build the concatenation of operand #I of every op in SubOps, typed VT.
    auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
      SmallVector<SDValue> Subs;
      for (SDValue SubOp : SubOps)
        Subs.push_back(SubOp.getOperand(I));
      // Attempt to peek through bitcasts and concat the original subvectors.
      EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
      if (SubVT.isSimple() && SubVT.isVector()) {
        EVT ConcatVT =
            EVT::getVectorVT(*DAG.getContext(), SubVT.getScalarType(),
                             SubVT.getVectorElementCount() * Subs.size());
        for (SDValue &Sub : Subs)
          Sub = DAG.getBitcast(SubVT, Sub);
        return DAG.getBitcast(
            VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
      }
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
    };
    // True if operand #Op of every sub-op is a constant build vector, or if
    // every one is the I'th in-order subvector extracted from a VT-typed
    // source.
    auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
      bool AllConstants = true;
      bool AllSubVectors = true;
      for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
        SDValue Sub = SubOps[I].getOperand(Op);
        unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
        SDValue BC = peekThroughBitcasts(Sub);
        AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
                        ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
        AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
                         Sub.getOperand(0).getValueType() == VT &&
                         Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
      }
      return AllConstants || AllSubVectors;
    };

    switch (Op0.getOpcode()) {
    case X86ISD::VBROADCAST: {
      if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
            return Op.getOperand(0).getValueType().is128BitVector();
          })) {
        if (VT == MVT::v4f64 || VT == MVT::v4i64)
          return DAG.getNode(X86ISD::UNPCKL, DL, VT,
                             ConcatSubOperand(VT, Ops, 0),
                             ConcatSubOperand(VT, Ops, 0));
        // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
        if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
          return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
                                              : X86ISD::PSHUFD,
                             DL, VT, ConcatSubOperand(VT, Ops, 0),
                             getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
      }
      break;
    }
    case X86ISD::MOVDDUP:
    case X86ISD::MOVSHDUP:
    case X86ISD::MOVSLDUP: {
      if (!IsSplat)
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0));
      break;
    }
    case X86ISD::SHUFP: {
      // Add SHUFPD support if/when necessary.
      if (!IsSplat && VT.getScalarType() == MVT::f32 &&
          llvm::all_of(Ops, [Op0](SDValue Op) {
            return Op.getOperand(2) == Op0.getOperand(2);
          })) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
      }
      break;
    }
    case X86ISD::UNPCKH:
    case X86ISD::UNPCKL: {
      // Don't concatenate build_vector patterns.
      if (!IsSplat && EltSizeInBits >= 32 &&
          ((VT.is256BitVector() && Subtarget.hasInt256()) ||
           (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
          none_of(Ops, [](SDValue Op) {
            return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
                       ISD::SCALAR_TO_VECTOR ||
                   peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
                       ISD::SCALAR_TO_VECTOR;
          })) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1));
      }
      break;
    }
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFD:
      if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
          Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
      }
      [[fallthrough]];
    case X86ISD::VPERMILPI:
      if (!IsSplat && EltSizeInBits == 32 &&
          (VT.is256BitVector() ||
           (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
          all_of(Ops, [&Op0](SDValue Op) {
            return Op0.getOperand(1) == Op.getOperand(1);
          })) {
        MVT FloatVT = VT.changeVectorElementType(MVT::f32);
        SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
        Res =
            DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
        return DAG.getBitcast(VT, Res);
      }
      if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
        // Merge the two 2-bit immediates into one 4-bit immediate.
        uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
        uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
        uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           DAG.getTargetConstant(Idx, DL, MVT::i8));
      }
      break;
    case X86ISD::PSHUFB:
    case X86ISD::PSADBW:
    case X86ISD::VPMADDUBSW:
    case X86ISD::VPMADDWD:
      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
                       (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
        // The source type may differ from VT; widen it by NumOps.
        MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
        SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
                                 NumOps * SrcVT.getVectorNumElements());
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(SrcVT, Ops, 0),
                           ConcatSubOperand(SrcVT, Ops, 1));
      }
      break;
    case X86ISD::VPERMV:
      if (!IsSplat && NumOps == 2 &&
          (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
        MVT OpVT = Op0.getSimpleValueType();
        int NumSrcElts = OpVT.getVectorNumElements();
        SmallVector<int> ConcatMask;
        for (unsigned i = 0; i != NumOps; ++i) {
          SmallVector<int> SubMask;
          SmallVector<SDValue> SubOps;
          if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
            break;
          for (int M : SubMask) {
            // Rebase non-sentinel indices onto the i'th half of the source.
            if (0 <= M)
              M += i * NumSrcElts;
            ConcatMask.push_back(M);
          }
        }
        // Only proceed if every sub-mask was decoded.
        if (ConcatMask.size() == (NumOps * NumSrcElts)) {
          SDValue Src = concatSubVectors(Ops[0].getOperand(1),
                                         Ops[1].getOperand(1), DAG, DL);
          MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
          MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
          SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
          return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
        }
      }
      break;
    case X86ISD::VPERMV3:
      if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
        MVT OpVT = Op0.getSimpleValueType();
        int NumSrcElts = OpVT.getVectorNumElements();
        SmallVector<int> ConcatMask;
        for (unsigned i = 0; i != NumOps; ++i) {
          SmallVector<int> SubMask;
          SmallVector<SDValue> SubOps;
          if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
            break;
          for (int M : SubMask) {
            if (0 <= M) {
              // Indices into the second source move past the widened first
              // source, then every index is rebased onto the i'th half.
              M += M < NumSrcElts ? 0 : NumSrcElts;
              M += i * NumSrcElts;
            }
            ConcatMask.push_back(M);
          }
        }
        // Only proceed if every sub-mask was decoded.
        if (ConcatMask.size() == (NumOps * NumSrcElts)) {
          SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
                                          Ops[1].getOperand(0), DAG, DL);
          SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
                                          Ops[1].getOperand(2), DAG, DL);
          MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
          MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
          SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
          return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
        }
      }
      break;
    case X86ISD::VPERM2X128: {
      if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
        assert(NumOps == 2 && "Bad concat_vectors operands");
        unsigned Imm0 = Ops[0].getConstantOperandVal(2);
        unsigned Imm1 = Ops[1].getConstantOperandVal(2);
        // TODO: Handle zero'd subvectors.
        if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
          int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
                         (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
          MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
          SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
                                         Ops[0].getOperand(1), DAG, DL);
          SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
                                         Ops[1].getOperand(1), DAG, DL);
          SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
                                    DAG.getBitcast(ShuffleVT, LHS),
                                    DAG.getBitcast(ShuffleVT, RHS),
                                    getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
          return DAG.getBitcast(VT, Res);
        }
      }
      break;
    }
    case X86ISD::SHUF128: {
      if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
        unsigned Imm0 = Ops[0].getConstantOperandVal(2);
        unsigned Imm1 = Ops[1].getConstantOperandVal(2);
        unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
                       ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
        SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
                                       Ops[0].getOperand(1), DAG, DL);
        SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
                                       Ops[1].getOperand(1), DAG, DL);
        return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
                           DAG.getTargetConstant(Imm, DL, MVT::i8));
      }
      break;
    }
    case ISD::TRUNCATE:
      if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
        EVT SrcVT = Ops[0].getOperand(0).getValueType();
        if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
            SrcVT == Ops[1].getOperand(0).getValueType() &&
            Subtarget.useAVX512Regs() &&
            Subtarget.getPreferVectorWidth() >= 512 &&
            (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
          EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
          return DAG.getNode(ISD::TRUNCATE, DL, VT,
                             ConcatSubOperand(NewSrcVT, Ops, 0));
        }
      }
      break;
    case X86ISD::VSHLI:
    case X86ISD::VSRLI:
      // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
      // TODO: Move this to LowerShiftByScalarImmediate?
      if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
          llvm::all_of(Ops, [](SDValue Op) {
            return Op.getConstantOperandAPInt(1) == 32;
          })) {
        SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
        SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
        if (Op0.getOpcode() == X86ISD::VSHLI) {
          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
                                     {8, 0, 8, 2, 8, 4, 8, 6});
        } else {
          Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
                                     {1, 8, 3, 8, 5, 8, 7, 8});
        }
        return DAG.getBitcast(VT, Res);
      }
      [[fallthrough]];
    case X86ISD::VSRAI:
    case X86ISD::VSHL:
    case X86ISD::VSRL:
    case X86ISD::VSRA:
      if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
           (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
            (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
          llvm::all_of(Ops, [Op0](SDValue Op) {
            return Op0.getOperand(1) == Op.getOperand(1);
          })) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
      }
      break;
    case X86ISD::VPERMI:
    case X86ISD::VROTLI:
    case X86ISD::VROTRI:
      if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
          llvm::all_of(Ops, [Op0](SDValue Op) {
            return Op0.getOperand(1) == Op.getOperand(1);
          })) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
      }
      break;
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case X86ISD::ANDNP:
      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
                       (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1));
      }
      break;
    case X86ISD::PCMPEQ:
    case X86ISD::PCMPGT:
      if (!IsSplat && VT.is256BitVector() &&
          (Subtarget.hasInt256() || VT == MVT::v8i32) &&
          (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
        if (Subtarget.hasInt256())
          return DAG.getNode(Op0.getOpcode(), DL, VT,
                             ConcatSubOperand(VT, Ops, 0),
                             ConcatSubOperand(VT, Ops, 1));

        // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
        // TODO: Handle v4f64 as well?
        unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
        for (unsigned I = 0; I != NumOps; ++I) {
          MaxSigBitsLHS =
              std::max(MaxSigBitsLHS,
                       DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
          MaxSigBitsRHS =
              std::max(MaxSigBitsRHS,
                       DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
          // Both maxima are already at the element width; stop scanning.
          if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
            break;
        }

        ISD::CondCode ICC =
            Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
        ISD::CondCode FCC =
            Op0.getOpcode() == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;

        MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
        MVT FpVT = VT.changeVectorElementType(FpSVT);

        if (std::optional<unsigned> CastOpc =
                CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
          SDValue LHS = ConcatSubOperand(VT, Ops, 0);
          SDValue RHS = ConcatSubOperand(VT, Ops, 1);
          LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
          RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);

          bool IsAlwaysSignaling;
          unsigned FSETCC =
              translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
          return DAG.getBitcast(
              VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
                              DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
        }
      }
      break;
    case ISD::CTPOP:
    case ISD::CTTZ:
    case ISD::CTLZ:
    case ISD::CTTZ_ZERO_UNDEF:
    case ISD::CTLZ_ZERO_UNDEF:
      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
                       (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0));
      }
      break;
    case X86ISD::GF2P8AFFINEQB:
      if (!IsSplat &&
          (VT.is256BitVector() ||
           (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
          llvm::all_of(Ops, [Op0](SDValue Op) {
            return Op0.getOperand(2) == Op.getOperand(2);
          })) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
      }
      break;
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
                       (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
                        (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1));
      }
      break;
    // VADD, VSUB and VMUL can execute on more ports than VINSERT and have
    // short latency, so only concatenate them when doing so does not
    // introduce an extra VINSERT.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
      if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
          (VT.is256BitVector() ||
           (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1));
      }
      break;
    case ISD::FDIV:
      if (!IsSplat && (VT.is256BitVector() ||
                       (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1));
      }
      break;
    case X86ISD::HADD:
    case X86ISD::HSUB:
    case X86ISD::FHADD:
    case X86ISD::FHSUB:
      if (!IsSplat && VT.is256BitVector() &&
          (VT.isFloatingPoint() || Subtarget.hasInt256())) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1));
      }
      break;
    case X86ISD::PACKSS:
    case X86ISD::PACKUS:
      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
                       (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
        // The source type differs from VT; widen it by NumOps.
        MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
        SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
                                 NumOps * SrcVT.getVectorNumElements());
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(SrcVT, Ops, 0),
                           ConcatSubOperand(SrcVT, Ops, 1));
      }
      break;
    case X86ISD::PALIGNR:
      if (!IsSplat &&
          ((VT.is256BitVector() && Subtarget.hasInt256()) ||
           (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
          llvm::all_of(Ops, [Op0](SDValue Op) {
            return Op0.getOperand(2) == Op.getOperand(2);
          })) {
        return DAG.getNode(Op0.getOpcode(), DL, VT,
                           ConcatSubOperand(VT, Ops, 0),
                           ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
      }
      break;
    case X86ISD::BLENDI:
      if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
        uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
        uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
        // MVT::v16i16 has repeated blend mask.
        if (Op0.getSimpleValueType() == MVT::v16i16) {
          Mask0 = (Mask0 << 8) | Mask0;
          Mask1 = (Mask1 << 8) | Mask1;
        }
        uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
        MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        SDValue Sel =
            DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
        return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
                             ConcatSubOperand(VT, Ops, 0));
      }
      break;
    case ISD::VSELECT:
      if (!IsSplat && Subtarget.hasAVX512() &&
          (VT.is256BitVector() ||
           (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
          (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
        EVT SelVT = Ops[0].getOperand(0).getValueType();
        if (SelVT.getVectorElementType() == MVT::i1) {
          SelVT = EVT::getVectorVT(Ctx, MVT::i1,
                                   NumOps * SelVT.getVectorNumElements());
          if (TLI.isTypeLegal(SelVT))
            return DAG.getNode(Op0.getOpcode(), DL, VT,
                               ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
                               ConcatSubOperand(VT, Ops, 1),
                               ConcatSubOperand(VT, Ops, 2));
        }
      }
      [[fallthrough]];
    case X86ISD::BLENDV:
      if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
          (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
          IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
        EVT SelVT = Ops[0].getOperand(0).getValueType();
        SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
        if (TLI.isTypeLegal(SelVT))
          return DAG.getNode(Op0.getOpcode(), DL, VT,
                             ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
                             ConcatSubOperand(VT, Ops, 1),
                             ConcatSubOperand(VT, Ops, 2));
      }
      break;
    }
  }

  // Fold subvector loads into one.
  // If needed, look through bitcasts to get to the load.
  if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
    unsigned Fast;
    // Note: this X86TargetLowering pointer shadows the outer TLI reference
    // for the duration of this scope.
    const X86TargetLowering *TLI = Subtarget.getTargetLowering();
    if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
                                *FirstLd->getMemOperand(), &Fast) &&
        Fast) {
      if (SDValue Ld =
              EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
        return Ld;
    }
  }

  // Attempt to fold target constant loads.
  if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
    SmallVector<APInt> EltBits;
    APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
    for (unsigned I = 0; I != NumOps; ++I) {
      APInt OpUndefElts;
      SmallVector<APInt> OpEltBits;
      if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
                                         OpEltBits, /*AllowWholeUndefs*/ true,
                                         /*AllowPartialUndefs*/ false))
        break;
      EltBits.append(OpEltBits);
      UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
    }
    // Only proceed if every operand was decoded.
    if (EltBits.size() == VT.getVectorNumElements()) {
      Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      SDValue CV = DAG.getConstantPool(C, PVT);
      MachineFunction &MF = DAG.getMachineFunction();
      MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
      SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
      // Redirect other users of the first subvector to the low part of the
      // new wide load.
      SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
      DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
      return Ld;
    }
  }

  // If this simple subvector or scalar/subvector broadcast_load is inserted
  // into both halves, use a larger broadcast_load. Update other uses to use
  // an extracted subvector.
  if (IsSplat &&
      (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
    if (ISD::isNormalLoad(Op0.getNode()) ||
        Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
        Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
      // Op0 may be a plain load or a broadcast load here, hence MemSDNode.
      auto *Mem = cast<MemSDNode>(Op0);
      unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
                         ? X86ISD::VBROADCAST_LOAD
                         : X86ISD::SUBV_BROADCAST_LOAD;
      if (SDValue BcastLd =
              getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
        SDValue BcastSrc =
            extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
        DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
        return BcastLd;
      }
    }
  }

  // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
  if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
      Subtarget.useAVX512Regs()) {
    MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
    SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
    Res = DAG.getBitcast(ShuffleVT, Res);
    Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
                      getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
    return DAG.getBitcast(VT, Res);
  }

  return SDValue();
}

/// Combine an ISD::CONCAT_VECTORS node.
///
/// i1-element vectors are only constant folded (through an integer constant
/// of the same total width, when that integer type is legal). Other vectors
/// are handed to combineConcatVectorOps when AVX is available and both the
/// result and operand types are legal.
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  EVT SrcVT = N->getOperand(0).getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SmallVector<SDValue> Ops(N->op_begin(), N->op_end());

  if (VT.getVectorElementType() == MVT::i1) {
    // Attempt to constant fold.
    unsigned SubSizeInBits = SrcVT.getSizeInBits();
    APInt Constant = APInt::getZero(VT.getSizeInBits());
    for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
      auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
      if (!C)
        break;
      Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
      // Reaching the last operand means every operand was a constant.
      if (I == (E - 1)) {
        EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
        if (TLI.isTypeLegal(IntVT))
          return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
      }
    }

    // Don't do anything else for i1 vectors.
return SDValue(); } if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) { if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG, DCI, Subtarget)) return R; } return SDValue(); } static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1; SDLoc dl(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); uint64_t IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); if (Vec.isUndef() && SubVec.isUndef()) return DAG.getUNDEF(OpVT); // Inserting undefs/zeros into zeros/undefs is a zero vector. if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) && (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode()))) return getZeroVector(OpVT, Subtarget, DAG, dl); if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { uint64_t Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec.getOperand(1), DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); } // If we're inserting into a zero vector and our input was extracted from an // insert into a zero vector of the same type and the extraction was at // least as large as the original insertion. Just insert the original // subvector into a zero vector. 
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && isNullConstant(SubVec.getOperand(1)) && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); if (isNullConstant(Ins.getOperand(2)) && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits().getFixedValue() <= SubVecVT.getFixedSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), Ins.getOperand(1), N->getOperand(2)); } } // Stop here if this is an i1 vector. if (IsI1Vector) return SDValue(); // Eliminate an intermediate vector widening: // insert_subvector X, (insert_subvector undef, Y, 0), Idx --> // insert_subvector X, Y, Idx // TODO: This is a more general version of a DAGCombiner fold, can we move it // there? if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2))) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec.getOperand(1), N->getOperand(2)); // If this is an insert of an extract, combine to a shuffle. Don't do this // if the insert or extract can be represented with a subregister operation. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && SubVec.getOperand(0).getSimpleValueType() == OpVT && (IdxVal != 0 || !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) { int ExtIdxVal = SubVec.getConstantOperandVal(1); if (ExtIdxVal != 0) { int VecNumElts = OpVT.getVectorNumElements(); int SubVecNumElts = SubVecVT.getVectorNumElements(); SmallVector Mask(VecNumElts); // First create an identity shuffle mask. for (int i = 0; i != VecNumElts; ++i) Mask[i] = i; // Now insert the extracted portion. for (int i = 0; i != SubVecNumElts; ++i) Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); } } // Match concat_vector style patterns. 
SmallVector SubVectorOps; if (collectConcatOps(N, SubVectorOps, DAG)) { if (SDValue Fold = combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) return Fold; // If we're inserting all zeros into the upper half, change this to // a concat with zero. We will match this to a move // with implicit upper bit zeroing during isel. // We do this here because we don't want combineConcatVectorOps to // create INSERT_SUBVECTOR from CONCAT_VECTORS. if (SubVectorOps.size() == 2 && ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode())) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVectorOps[0], DAG.getIntPtrConstant(0, dl)); // Attempt to recursively combine to a shuffle. if (all_of(SubVectorOps, [](SDValue SubOp) { return isTargetShuffle(SubOp.getOpcode()); })) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } } // If this is a broadcast insert into an upper undef, use a larger broadcast. if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); // If this is a broadcast load inserted into an upper undef, use a larger // broadcast load. if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() && SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) { auto *MemIntr = cast(SubVec); SDVTList Tys = DAG.getVTList(OpVT, MVT::Other); SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); return BcastLd; } // If we're splatting the lower half subvector of a full vector load into the // upper half, attempt to create a subvector broadcast. 
if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() && Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) { auto *VecLd = dyn_cast(Vec); auto *SubLd = dyn_cast(SubVec); if (VecLd && SubLd && DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT, SubLd, 0, DAG); } return SDValue(); } /// If we are extracting a subvector of a vector select and the select condition /// is composed of concatenated vectors, try to narrow the select width. This /// is a common pattern for AVX1 integer code because 256-bit selects may be /// legal, but there is almost no integer math/logic available for 256-bit. /// This function should only be called with legal types (otherwise, the calls /// to get simple value types will assert). static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG) { SDValue Sel = Ext->getOperand(0); if (Sel.getOpcode() != ISD::VSELECT || !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG)) return SDValue(); // Note: We assume simple value types because this should only be called with // legal operations/types. // TODO: This can be extended to handle extraction to 256-bits. MVT VT = Ext->getSimpleValueType(0); if (!VT.is128BitVector()) return SDValue(); MVT SelCondVT = Sel.getOperand(0).getSimpleValueType(); if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector()) return SDValue(); MVT WideVT = Ext->getOperand(0).getSimpleValueType(); MVT SelVT = Sel.getSimpleValueType(); assert((SelVT.is256BitVector() || SelVT.is512BitVector()) && "Unexpected vector type with legal operations"); unsigned SelElts = SelVT.getVectorNumElements(); unsigned CastedElts = WideVT.getVectorNumElements(); unsigned ExtIdx = Ext->getConstantOperandVal(1); if (SelElts % CastedElts == 0) { // The select has the same or more (narrower) elements than the extract // operand. 
The extraction index gets scaled by that factor. ExtIdx *= (SelElts / CastedElts); } else if (CastedElts % SelElts == 0) { // The select has less (wider) elements than the extract operand. Make sure // that the extraction index can be divided evenly. unsigned IndexDivisor = CastedElts / SelElts; if (ExtIdx % IndexDivisor != 0) return SDValue(); ExtIdx /= IndexDivisor; } else { llvm_unreachable("Element count of simple vector types are not divisible?"); } unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits(); unsigned NarrowElts = SelElts / NarrowingFactor; MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts); SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL); SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL); SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL); SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF); return DAG.getBitcast(VT, NarrowSel); } static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // For AVX1 only, if we are extracting from a 256-bit and+not (which will // eventually get combined/lowered into ANDNP) with a concatenated operand, // split the 'and' into 128-bit ops to avoid the concatenate and extract. // We let generic combining take over from there to simplify the // insert/extract and 'not'. // This pattern emerges during AVX1 legalization. We handle it before lowering // to avoid complications like splitting constant vector loads. // Capture the original wide type in the likely case that we need to bitcast // back to this type. 
if (!N->getValueType(0).isSimple()) return SDValue(); MVT VT = N->getSimpleValueType(0); SDValue InVec = N->getOperand(0); unsigned IdxVal = N->getConstantOperandVal(1); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); unsigned InSizeInBits = InVecVT.getSizeInBits(); unsigned NumSubElts = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(N); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) && InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) { auto isConcatenatedNot = [](SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) return false; SDValue NotOp = V->getOperand(0); return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; }; if (isConcatenatedNot(InVecBC.getOperand(0)) || isConcatenatedNot(InVecBC.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, DAG.getBitcast(InVecVT, Concat), N->getOperand(1)); } } if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG)) return V; if (ISD::isBuildVectorAllZeros(InVec.getNode())) return getZeroVector(VT, Subtarget, DAG, DL); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { if (VT.getScalarType() == MVT::i1) return DAG.getConstant(1, DL, VT); return getOnesVector(VT, DAG, DL); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts)); // If we are extracting from an insert into a larger vector, replace with a // smaller insert if we don't access less than the original subvector. Don't // do this for i1 vectors. // TODO: Relax the matching indices requirement? 
if (VT.getVectorElementType() != MVT::i1 && InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() && IdxVal == InVec.getConstantOperandVal(2) && InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, InVec.getOperand(0), N->getOperand(1)); unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt, InVec.getOperand(1), DAG.getVectorIdxConstant(NewIdxVal, DL)); } // If we're extracting an upper subvector from a broadcast we should just // extract the lowest subvector instead which should allow // SimplifyDemandedVectorElts do more simplifications. if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST || InVec.getOpcode() == X86ISD::VBROADCAST_LOAD || DAG.isSplatValue(InVec, /*AllowUndefs*/ false))) return extractSubVector(InVec, 0, DAG, DL, SizeInBits); // If we're extracting a broadcasted subvector, just use the lowest subvector. if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && cast(InVec)->getMemoryVT() == VT) return extractSubVector(InVec, 0, DAG, DL, SizeInBits); // Attempt to extract from the source of a shuffle vector. if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) { SmallVector ShuffleMask; SmallVector ScaledMask; SmallVector ShuffleInputs; unsigned NumSubVecs = InSizeInBits / SizeInBits; // Decode the shuffle mask and scale it so its shuffling subvectors. 
if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { unsigned SubVecIdx = IdxVal / NumSubElts; if (ScaledMask[SubVecIdx] == SM_SentinelUndef) return DAG.getUNDEF(VT); if (ScaledMask[SubVecIdx] == SM_SentinelZero) return getZeroVector(VT, Subtarget, DAG, DL); SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; if (Src.getValueSizeInBits() == InSizeInBits) { unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts; return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, DL, SizeInBits); } } } auto IsExtractFree = [](SDValue V) { V = peekThroughBitcasts(V); if (ISD::isBuildVectorOfConstantSDNodes(V.getNode())) return true; if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) return true; return V.isUndef(); }; // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. unsigned InOpcode = InVec.getOpcode(); if (InVec.hasOneUse()) { if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0)); } // v2f64 CVTUDQ2PD(v4i32). if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() && InVec.getOperand(0).getValueType() == MVT::v4i32) { return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0)); } // v2f64 CVTPS2PD(v4f32). if (InOpcode == ISD::FP_EXTEND && InVec.getOperand(0).getValueType() == MVT::v4f32) { return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0)); } } // v4i32 CVTPS2DQ(v4f32). 
if (InOpcode == ISD::FP_TO_SINT && VT == MVT::v4i32) { SDValue Src = InVec.getOperand(0); if (Src.getValueType().getScalarType() == MVT::f32) return DAG.getNode(InOpcode, DL, VT, extractSubVector(Src, IdxVal, DAG, DL, SizeInBits)); } if (IdxVal == 0 && (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) && (SizeInBits == 128 || SizeInBits == 256) && InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) { SDValue Ext = InVec.getOperand(0); if (Ext.getValueSizeInBits() > SizeInBits) Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits); unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode); return DAG.getNode(ExtOp, DL, VT, Ext); } if (IdxVal == 0 && InOpcode == ISD::VSELECT && InVec.getOperand(0).getValueType().is256BitVector() && InVec.getOperand(1).getValueType().is256BitVector() && InVec.getOperand(2).getValueType().is256BitVector()) { SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128); SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128); SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); } if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && (SizeInBits == 128 || SizeInBits == 256)) { SDValue InVecSrc = InVec.getOperand(0); unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits; SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); return DAG.getNode(InOpcode, DL, VT, Ext); } if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ || InOpcode == X86ISD::PCMPGT) && (IsExtractFree(InVec.getOperand(0)) || IsExtractFree(InVec.getOperand(1))) && SizeInBits == 128) { SDValue Ext0 = extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); SDValue Ext1 = extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits); if (InOpcode == X86ISD::CMPP) return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2)); return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1); } if 
(InOpcode == X86ISD::MOVDDUP && (SizeInBits == 128 || SizeInBits == 256)) { SDValue Ext0 = extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); return DAG.getNode(InOpcode, DL, VT, Ext0); } } // Always split vXi64 logical shifts where we're extracting the upper 32-bits // as this is very likely to fold into a shuffle/truncation. if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) && InVecVT.getScalarSizeInBits() == 64 && InVec.getConstantOperandAPInt(1) == 32) { SDValue Ext = extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1)); } return SDValue(); } static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and. // This occurs frequently in our masked scalar intrinsic code and our // floating point select lowering with AVX512. // TODO: SimplifyDemandedBits instead? if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() && isOneConstant(Src.getOperand(1))) return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0)); // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec. if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() && Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && isNullConstant(Src.getOperand(1))) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0), Src.getOperand(1)); // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero. // TODO: Move to DAGCombine/SimplifyDemandedBits? if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) { auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) { if (Op.getValueType() != MVT::i64) return SDValue(); unsigned Opc = IsZeroExt ? 
ISD::ZERO_EXTEND : ISD::ANY_EXTEND; if (Op.getOpcode() == Opc && Op.getOperand(0).getScalarValueSizeInBits() <= 32) return Op.getOperand(0); unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD; if (auto *Ld = dyn_cast(Op)) if (Ld->getExtensionType() == Ext && Ld->getMemoryVT().getScalarSizeInBits() <= 32) return Op; if (IsZeroExt) { KnownBits Known = DAG.computeKnownBits(Op); if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32) return Op; } return SDValue(); }; if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false)) return DAG.getBitcast( VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32))); if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true)) return DAG.getBitcast( VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32)))); } // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ. if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST && Src.getOperand(0).getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0)); // See if we're broadcasting the scalar value, in which case just reuse that. // Ensure the same SDValue from the SDNode use is being used. if (VT.getScalarType() == Src.getValueType()) for (SDNode *User : Src->uses()) if (User->getOpcode() == X86ISD::VBROADCAST && Src == User->getOperand(0)) { unsigned SizeInBits = VT.getFixedSizeInBits(); unsigned BroadcastSizeInBits = User->getValueSizeInBits(0).getFixedValue(); if (BroadcastSizeInBits == SizeInBits) return SDValue(User, 0); if (BroadcastSizeInBits > SizeInBits) return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits); // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test // coverage. } return SDValue(); } // Simplify PMULDQ and PMULUDQ operations. 
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Canonicalize constant to RHS. if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) && !DAG.isConstantIntBuildVectorOrConstantInt(RHS)) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); // Multiply by zero. // Don't return RHS as it may contain UNDEFs. if (ISD::isBuildVectorAllZeros(RHS.getNode())) return DAG.getConstant(0, SDLoc(N), N->getValueType(0)); // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI)) return SDValue(N, 0); // If the input is an extend_invec and the SimplifyDemandedBits call didn't // convert it to any_extend_invec, due to the LegalOperations check, do the // conversion directly to a vector shuffle manually. This exposes combine // opportunities missed by combineEXTEND_VECTOR_INREG not calling // combineX86ShufflesRecursively on SSE4.1 targets. // FIXME: This is basically a hack around several other issues related to // ANY_EXTEND_VECTOR_INREG. 
if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() && (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && LHS.getOperand(0).getValueType() == MVT::v4i32) { SDLoc dl(N); LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0), LHS.getOperand(0), { 0, -1, 1, -1 }); LHS = DAG.getBitcast(MVT::v2i64, LHS); return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); } if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() && (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && RHS.getOperand(0).getValueType() == MVT::v4i32) { SDLoc dl(N); RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0), RHS.getOperand(0), { 0, -1, 1, -1 }); RHS = DAG.getBitcast(MVT::v2i64, RHS); return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); } return SDValue(); } // Simplify VPMADDUBSW/VPMADDWD operations. static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { MVT VT = N->getSimpleValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); unsigned Opc = N->getOpcode(); bool IsPMADDWD = Opc == X86ISD::VPMADDWD; assert((Opc == X86ISD::VPMADDWD || Opc == X86ISD::VPMADDUBSW) && "Unexpected PMADD opcode"); // Multiply by zero. // Don't return LHS/RHS as it may contain UNDEFs. if (ISD::isBuildVectorAllZeros(LHS.getNode()) || ISD::isBuildVectorAllZeros(RHS.getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Constant folding. 
APInt LHSUndefs, RHSUndefs; SmallVector LHSBits, RHSBits; unsigned SrcEltBits = LHS.getScalarValueSizeInBits(); unsigned DstEltBits = VT.getScalarSizeInBits(); if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) && getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) { SmallVector Result; for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) { APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1]; APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1]; LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits); LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits); APInt Lo = LHSLo * RHSLo.sext(DstEltBits); APInt Hi = LHSHi * RHSHi.sext(DstEltBits); APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi); Result.push_back(Res); } return getConstVector(Result, VT, DAG, SDLoc(N)); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); unsigned Opcode = N->getOpcode(); unsigned InOpcode = In.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(N); // Try to merge vector loads and extend_inreg to an extload. if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { auto *Ld = cast(In); if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; EVT MemVT = VT.changeVectorElementType(SVT); if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { SDValue Load = DAG.getExtLoad( Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; } } } // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X). if (Opcode == InOpcode) return DAG.getNode(Opcode, DL, VT, In.getOperand(0)); // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0)) // -> EXTEND_VECTOR_INREG(X). // TODO: Handle non-zero subvector indices. if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 && In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) && In.getOperand(0).getOperand(0).getValueSizeInBits() == In.getValueSizeInBits()) return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0)); // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0). // TODO: Move to DAGCombine? if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() && In.getValueSizeInBits() == VT.getSizeInBits()) { unsigned NumElts = VT.getVectorNumElements(); unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits(); EVT EltVT = In.getOperand(0).getValueType(); SmallVector Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT)); for (unsigned I = 0; I != NumElts; ++I) Elts[I * Scale] = In.getOperand(I); return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts)); } // Attempt to combine as a shuffle on SSE41+ targets. 
if (Subtarget.hasSSE41()) { SDValue Op(N, 0); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; } return SDValue(); } static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return DAG.getConstant(0, SDLoc(N), VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) return SDValue(N, 0); return SDValue(); } // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS. // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce // extra instructions between the conversion due to going to scalar and back. static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16) return SDValue(); if (N->getValueType(0) != MVT::f32 || N->getOperand(0).getOperand(0).getValueType() != MVT::f32) return SDValue(); SDLoc dl(N); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N->getOperand(0).getOperand(0)); Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, DAG.getTargetConstant(4, dl, MVT::i32)); Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, DAG.getIntPtrConstant(0, dl)); } static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); bool IsStrict = N->isStrictFPOpcode(); SDValue Src = N->getOperand(IsStrict ? 
1 : 0); EVT SrcVT = Src.getValueType(); SDLoc dl(N); if (SrcVT.getScalarType() == MVT::bf16) { if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND && !IsStrict && Src.getOperand(0).getValueType() == VT) return Src.getOperand(0); if (!SrcVT.isVector()) return SDValue(); assert(!IsStrict && "Strict FP doesn't support BF16"); if (VT.getVectorElementType() == MVT::f64) { EVT TmpVT = VT.changeVectorElementType(MVT::f32); return DAG.getNode(ISD::FP_EXTEND, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src)); } assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext"); EVT NVT = SrcVT.changeVectorElementType(MVT::i32); Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src); Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src); Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT)); return DAG.getBitcast(VT, Src); } if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) return SDValue(); if (Subtarget.hasFP16()) return SDValue(); if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16) return SDValue(); if (VT.getVectorElementType() != MVT::f32 && VT.getVectorElementType() != MVT::f64) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 1 || !isPowerOf2_32(NumElts)) return SDValue(); // Convert the input to vXi16. EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); Src = DAG.getBitcast(IntVT, Src); // Widen to at least 8 input elements. if (NumElts < 8) { unsigned NumConcats = 8 / NumElts; SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT) : DAG.getConstant(0, dl, IntVT); SmallVector Ops(NumConcats, Fill); Ops[0] = Src; Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops); } // Destination is vXf32 with at least 4 elements. 
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, std::max(4U, NumElts)); SDValue Cvt, Chain; if (IsStrict) { Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other}, {N->getOperand(0), Src}); Chain = Cvt.getValue(1); } else { Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src); } if (NumElts < 4) { assert(NumElts == 2 && "Unexpected size"); Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt, DAG.getIntPtrConstant(0, dl)); } if (IsStrict) { // Extend to the original VT if necessary. if (Cvt.getValueType() != VT) { Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other}, {Chain, Cvt}); Chain = Cvt.getValue(1); } return DAG.getMergeValues({Cvt, Chain}, dl); } // Extend to the original VT if necessary. return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt); } // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract // from. Limit this to cases where the loads have the same input chain and the // output chains are unused. This avoids any memory ordering issues. static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD || N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) && "Unknown broadcast load type"); // Only do this if the chain result is unused. if (N->hasAnyUseOfValue(1)) return SDValue(); auto *MemIntrin = cast(N); SDValue Ptr = MemIntrin->getBasePtr(); SDValue Chain = MemIntrin->getChain(); EVT VT = N->getSimpleValueType(0); EVT MemVT = MemIntrin->getMemoryVT(); // Look at other users of our base pointer and try to find a wider broadcast. // The input chain and the size of the memory VT must match. 
for (SDNode *User : Ptr->uses()) if (User != N && User->getOpcode() == N->getOpcode() && cast(User)->getBasePtr() == Ptr && cast(User)->getChain() == Chain && cast(User)->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() && !User->hasAnyUseOfValue(1) && User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) { SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), VT.getSizeInBits()); Extract = DAG.getBitcast(VT, Extract); return DCI.CombineTo(N, Extract, SDValue(User, 1)); } return SDValue(); } static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) return SDValue(); bool IsStrict = N->isStrictFPOpcode(); EVT VT = N->getValueType(0); SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 || SrcVT.getVectorElementType() != MVT::f32) return SDValue(); SDLoc dl(N); SDValue Cvt, Chain; unsigned NumElts = VT.getVectorNumElements(); if (Subtarget.hasFP16()) { // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), // v4f32 (xint_to_fp v4i64)))) // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), // v8f16 (CVTXI2P v4i64))) if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS && Src.getNumOperands() == 2) { SDValue Cvt0, Cvt1; SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); bool IsOp0Strict = Op0->isStrictFPOpcode(); if (Op0.getOpcode() != Op1.getOpcode() || Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 || Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) { return SDValue(); } int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11}; if (IsStrict) { assert(IsOp0Strict && "Op0 must be strict node"); unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP ? 
X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P; Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other}, {Op0.getOperand(0), Op0.getOperand(1)}); Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other}, {Op1.getOperand(0), Op1.getOperand(1)}); Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask); return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl); } unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P : X86ISD::CVTUI2P; Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0)); Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0)); return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask); } return SDValue(); } if (NumElts == 1 || !isPowerOf2_32(NumElts)) return SDValue(); // Widen to at least 4 input elements. if (NumElts < 4) Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getConstantFP(0.0, dl, SrcVT)); // Destination is v8i16 with at least 8 elements. EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts)); SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32); if (IsStrict) { Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other}, {N->getOperand(0), Src, Rnd}); Chain = Cvt.getValue(1); } else { Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd); } // Extract down to real number of elements. if (NumElts < 8) { EVT IntVT = VT.changeVectorElementTypeToInteger(); Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt, DAG.getIntPtrConstant(0, dl)); } Cvt = DAG.getBitcast(VT, Cvt); if (IsStrict) return DAG.getMergeValues({Cvt, Chain}, dl); return Cvt; } static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { SDValue Src = N->getOperand(0); // Turn MOVDQ2Q+simple_load into an mmx load. 
if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { LoadSDNode *LN = cast(Src.getNode()); if (LN->isSimple()) { SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(), LN->getPointerInfo(), LN->getOriginalAlign(), LN->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1)); return NewLd; } } return SDValue(); } static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { unsigned NumBits = N->getSimpleValueType(0).getSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI)) return SDValue(N, 0); return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: return combineScalarToVector(N, DAG); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::CONCAT_VECTORS: return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget); case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget); case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget); case X86ISD::CLOAD: case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return 
combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget); case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget); case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget); case ISD::AVGCEILS: case ISD::AVGCEILU: case ISD::AVGFLOORS: case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget); case X86ISD::BEXTR: case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); case X86ISD::VEXTRACT_STORE: return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); case ISD::UINT_TO_FP: case ISD::STRICT_UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::LRINT: case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case X86ISD::VFCMULC: case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: 
return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: case X86ISD::STRICT_CVTTP2SI: case X86ISD::CVTTP2SI: case X86ISD::STRICT_CVTTP2UI: case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); case X86ISD::STRICT_CVTPH2PS: case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::PACKSS: case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget); case X86ISD::VSHL: case X86ISD::VSRA: case X86ISD::VSRL: return combineVectorShiftVar(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::VALIGN: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case 
X86ISD::MOVLHPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::MOVSH: case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIL2: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VZEXT_MOVL: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: case X86ISD::STRICT_FMSUB: case X86ISD::FMSUB_RND: case X86ISD::FNMADD: case X86ISD::STRICT_FNMADD: case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: case X86ISD::STRICT_FNMSUB: case X86ISD::FNMSUB_RND: case ISD::FMA: case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI); case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget); case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget); case X86ISD::MGATHER: case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI); case ISD::MGATHER: case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI); case X86ISD::KSHIFTL: case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget); case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget); case X86ISD::VBROADCAST_LOAD: case X86ISD::SUBV_BROADCAST_LOAD: return 
combineBROADCAST_LOAD(N, DAG, DCI);
  case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
  case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
    // clang-format on
  }

  // No opcode-specific combine produced a replacement.
  return SDValue();
}

/// Unconditionally returns false; \p VT is not consulted.
bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
  return false;
}

// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
//
// Returns true when the subtarget has AVX512 or when \p VT is not a vector.
// \p TruncVT and \p ExtVT are accepted but not inspected.
bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
                                                  EVT ExtVT) const {
  return Subtarget.hasAVX512() || !VT.isVector();
}

/// Return true if it is desirable to perform operation \p Opc in type \p VT.
///
/// Illegal types are never desirable. Beyond that, a handful of narrow
/// (i8 / i16 / vXi8) operations are rejected; every other legal type is
/// accepted.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;

  // There are no vXi8 shifts.
  if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
    return false;

  // TODO: Almost no 8-bit ops are desirable because they have no actual
  //       size/speed advantages vs. 32-bit ops, but they do have a major
  //       potential disadvantage by causing partial register stalls.
  //
  // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
  // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
  // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
  // check for a constant operand to the multiply.
  if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
    return false;

  // i16 instruction encodings are longer and some i16 instructions are slow,
  // so those are not desirable.
  if (VT == MVT::i16) {
    switch (Opc) {
    default:
      break;
    case ISD::LOAD:
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
    case ISD::MUL:
      return false;
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
    case ISD::SUB:
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
      // NDD instruction never has "partial register write" issue b/c it has
      // destination register's upper bits [63:OSIZE] zeroed even when
      // OSIZE=8/16.
      return Subtarget.hasNDD();
    }
  }

  // Any legal type not explicitly accounted for above here is desirable.
  return true;
}

/// Expand an indirect jump-table branch.
///
/// When the module carries the "cf-protection-branch" flag, an
/// X86ISD::NT_BRIND node is emitted instead of deferring to the generic
/// TargetLowering expansion.
SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
                                                  SDValue Value, SDValue Addr,
                                                  int JTI,
                                                  SelectionDAG &DAG) const {
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  // A non-null module flag is the only thing checked; its value is not read.
  Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
  if (IsCFProtectionSupported) {
    // In case control-flow branch protection is enabled, we need to add
    // notrack prefix to the indirect branch.
    // In order to do that we create NT_BRIND SDNode.
    // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
    SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
    return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
  }

  return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
}

/// Choose which folds may be used when combining a logic op of two SETCCs.
///
/// Non-integer result types get no fold. Vector results allow `NotAnd`, plus
/// `ABS` when ISD::ABS is legal for the type of the values being compared.
TargetLowering::AndOrSETCCFoldKind
X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
    const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
  using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
  EVT VT = LogicOp->getValueType(0);
  EVT OpVT = SETCC0->getOperand(0).getValueType();
  if (!VT.isInteger())
    return AndOrSETCCFoldKind::None;

  if (VT.isVector())
    return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
                              (isOperationLegal(ISD::ABS, OpVT)
                                   ? AndOrSETCCFoldKind::ABS
                                   : AndOrSETCCFoldKind::None));

  // Don't use `NotAnd` as even though `not` is generally shorter code size than
  // `add`, `add` can lower to LEA which can save moves / spills. Any case where
  // `NotAnd` applies, `AddAnd` does as well.
  // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
  // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
return AndOrSETCCFoldKind::AddAnd; } bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL && isa(Op.getOperand(1)); // i16 is legal, but undesirable since i16 instruction encodings are longer // and some i16 instructions are slow. // 8-bit multiply-by-constant can usually be expanded to something cheaper // using LEA and/or other ALU ops. if (VT != MVT::i16 && !Is8BitMulByConstant) return false; auto IsFoldableRMW = [](SDValue Load, SDValue Op) { if (!Op.hasOneUse()) return false; SDNode *User = *Op->use_begin(); if (!ISD::isNormalStore(User)) return false; auto *Ld = cast(Load); auto *St = cast(User); return Ld->getBasePtr() == St->getBasePtr(); }; auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) { if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD) return false; if (!Op.hasOneUse()) return false; SDNode *User = *Op->use_begin(); if (User->getOpcode() != ISD::ATOMIC_STORE) return false; auto *Ld = cast(Load); auto *St = cast(User); return Ld->getBasePtr() == St->getBasePtr(); }; bool Commute = false; switch (Op.getOpcode()) { default: return false; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: break; case ISD::SHL: case ISD::SRA: case ISD::SRL: { SDValue N0 = Op.getOperand(0); // Look out for (store (shl (load), x)). if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op)) return false; break; } case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: Commute = true; [[fallthrough]]; case ISD::SUB: { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); // Avoid disabling potential load folding opportunities. 
if (X86::mayFoldLoad(N1, Subtarget) && (!Commute || !isa(N0) || (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op)))) return false; if (X86::mayFoldLoad(N0, Subtarget) && ((Commute && !isa(N1)) || (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op)))) return false; if (IsFoldableAtomicRMW(N0, Op) || (Commute && IsFoldableAtomicRMW(N1, Op))) return false; } } PVT = MVT::i32; return true; } //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// // Helper to match a string separated by whitespace. static bool matchAsm(StringRef S, ArrayRef Pieces) { S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. for (StringRef Piece : Pieces) { if (!S.starts_with(Piece)) // Check if the piece matches. return false; S = S.substr(Piece.size()); StringRef::size_type Pos = S.find_first_not_of(" \t"); if (Pos == 0) // We matched a prefix. return false; S = S.substr(Pos); } return S.empty(); } static bool clobbersFlagRegisters(const SmallVector &AsmPieces) { if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { if (llvm::is_contained(AsmPieces, "~{cc}") && llvm::is_contained(AsmPieces, "~{flags}") && llvm::is_contained(AsmPieces, "~{fpsr}")) { if (AsmPieces.size() == 3) return true; else if (llvm::is_contained(AsmPieces, "~{dirflag}")) return true; } } return false; } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast(CI->getCalledOperand()); const std::string &AsmStr = IA->getAsmString(); IntegerType *Ty = dyn_cast(CI->getType()); if (!Ty || Ty->getBitWidth() % 16 != 0) return false; // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: // FIXME: this should verify that we are targeting a 486 or better. 
If not, // we will turn this bswap into something that will be lowered to logical // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || matchAsm(AsmPieces[0], {"bswapl", "$0"}) || matchAsm(AsmPieces[0], {"bswapq", "$0"}) || matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); } // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); if (Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / 
bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && matchAsm(AsmPieces[1], {"bswap", "%edx"}) && matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } break; } return false; } static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { X86::CondCode Cond = StringSwitch(Constraint) .Case("{@cca}", X86::COND_A) .Case("{@ccae}", X86::COND_AE) .Case("{@ccb}", X86::COND_B) .Case("{@ccbe}", X86::COND_BE) .Case("{@ccc}", X86::COND_B) .Case("{@cce}", X86::COND_E) .Case("{@ccz}", X86::COND_E) .Case("{@ccg}", X86::COND_G) .Case("{@ccge}", X86::COND_GE) .Case("{@ccl}", X86::COND_L) .Case("{@ccle}", X86::COND_LE) .Case("{@ccna}", X86::COND_BE) .Case("{@ccnae}", X86::COND_B) .Case("{@ccnb}", X86::COND_AE) .Case("{@ccnbe}", X86::COND_A) .Case("{@ccnc}", X86::COND_AE) .Case("{@ccne}", X86::COND_NE) .Case("{@ccnz}", X86::COND_NE) .Case("{@ccng}", X86::COND_LE) .Case("{@ccnge}", X86::COND_L) .Case("{@ccnl}", X86::COND_GE) .Case("{@ccnle}", X86::COND_G) .Case("{@ccno}", X86::COND_NO) .Case("{@ccnp}", X86::COND_NP) .Case("{@ccns}", X86::COND_NS) .Case("{@cco}", X86::COND_O) .Case("{@ccp}", X86::COND_P) .Case("{@ccs}", X86::COND_S) .Default(X86::COND_INVALID); return Cond; } /// Given a constraint letter, return the type of constraint for this target. X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': case 'q': case 'Q': case 'f': case 't': case 'u': case 'y': case 'x': case 'v': case 'l': case 'k': // AVX512 masking registers. 
return C_RegisterClass; case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': return C_Register; case 'I': case 'J': case 'K': case 'N': case 'G': case 'L': case 'M': return C_Immediate; case 'C': case 'e': case 'Z': return C_Other; default: break; } } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; case 'W': if (Constraint[1] != 's') break; return C_Other; case 'Y': switch (Constraint[1]) { default: break; case 'z': return C_Register; case 'i': case 'm': case 'k': case 't': case '2': return C_RegisterClass; } break; case 'j': switch (Constraint[1]) { default: break; case 'r': case 'R': return C_RegisterClass; } } } else if (parseConstraintCode(Constraint) != X86::COND_INVALID) return C_Other; return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight X86TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &Info, const char *Constraint) const { ConstraintWeight Wt = CW_Invalid; Value *CallOperandVal = Info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *Ty = CallOperandVal->getType(); // Look at the constraint type. 
switch (*Constraint) { default: Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint); [[fallthrough]]; case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': if (CallOperandVal->getType()->isIntegerTy()) Wt = CW_SpecificReg; break; case 'f': case 't': case 'u': if (Ty->isFloatingPointTy()) Wt = CW_SpecificReg; break; case 'y': if (Ty->isX86_MMXTy() && Subtarget.hasMMX()) Wt = CW_SpecificReg; break; case 'Y': if (StringRef(Constraint).size() != 2) break; switch (Constraint[1]) { default: return CW_Invalid; // XMM0 case 'z': if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) || ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())) return CW_SpecificReg; return CW_Invalid; // Conditional OpMask regs (AVX512) case 'k': if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) return CW_Register; return CW_Invalid; // Any MMX reg case 'm': if (Ty->isX86_MMXTy() && Subtarget.hasMMX()) return Wt; return CW_Invalid; // Any SSE reg when ISA >= SSE2, same as 'x' case 'i': case 't': case '2': if (!Subtarget.hasSSE2()) return CW_Invalid; break; } break; case 'j': if (StringRef(Constraint).size() != 2) break; switch (Constraint[1]) { default: return CW_Invalid; case 'r': case 'R': if (CallOperandVal->getType()->isIntegerTy()) Wt = CW_SpecificReg; break; } break; case 'v': if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) Wt = CW_Register; [[fallthrough]]; case 'x': if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX())) Wt = CW_Register; break; case 'k': // Enable conditional vector operations using %k<#> registers. 
if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) Wt = CW_Register; break; case 'I': if (auto *C = dyn_cast(Info.CallOperandVal)) if (C->getZExtValue() <= 31) Wt = CW_Constant; break; case 'J': if (auto *C = dyn_cast(CallOperandVal)) if (C->getZExtValue() <= 63) Wt = CW_Constant; break; case 'K': if (auto *C = dyn_cast(CallOperandVal)) if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) Wt = CW_Constant; break; case 'L': if (auto *C = dyn_cast(CallOperandVal)) if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) Wt = CW_Constant; break; case 'M': if (auto *C = dyn_cast(CallOperandVal)) if (C->getZExtValue() <= 3) Wt = CW_Constant; break; case 'N': if (auto *C = dyn_cast(CallOperandVal)) if (C->getZExtValue() <= 0xff) Wt = CW_Constant; break; case 'G': case 'C': if (isa(CallOperandVal)) Wt = CW_Constant; break; case 'e': if (auto *C = dyn_cast(CallOperandVal)) if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) Wt = CW_Constant; break; case 'Z': if (auto *C = dyn_cast(CallOperandVal)) if (C->getZExtValue() <= 0xffffffff) Wt = CW_Constant; break; } return Wt; } /// Try to replace an X constraint, which matches anything, with another that /// has more specific requirements based on the type of the corresponding /// operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { if (Subtarget.hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } // Lower @cc targets via setcc. SDValue X86TargetLowering::LowerAsmOutputForConstraint( SDValue &Chain, SDValue &Glue, const SDLoc &DL, const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); if (Cond == X86::COND_INVALID) return SDValue(); // Check that return type is valid. 
if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || OpInfo.ConstraintVT.getSizeInBits() < 8) report_fatal_error("Glue output operand is of invalid type"); // Get EFLAGS register. Only update chain when copyfrom is glued. if (Glue.getNode()) { Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue); Chain = Glue.getValue(1); } else Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32); // Extract CC code. SDValue CC = getSETCC(Cond, Glue, DL, DAG); // Extend to 32-bits SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); return Result; } /// Lower the specified operand into the Ops vector. /// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector &Ops, SelectionDAG &DAG) const { SDValue Result; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'I': if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'J': if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'K': if (auto *C = dyn_cast(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'L': if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'M': if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'N': if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 255) { Result = 
DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'O': if (auto *C = dyn_cast(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'e': { // 32-bit signed value if (auto *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. } return; } case 'W': { assert(Constraint[1] == 's'); // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional // offset. if (const auto *BA = dyn_cast(Op)) { Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0))); } else { int64_t Offset = 0; if (Op->getOpcode() == ISD::ADD && isa(Op->getOperand(1))) { Offset = cast(Op->getOperand(1))->getSExtValue(); Op = Op->getOperand(0); } if (const auto *GA = dyn_cast(Op)) Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), GA->getValueType(0), Offset)); } return; } case 'Z': { // 32-bit unsigned value if (auto *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. return; } case 'i': { // Literal immediates are always ok. if (auto *CST = dyn_cast(Op)) { bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; BooleanContent BCont = getBooleanContents(MVT::i64); ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND; int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? 
CST->getZExtValue() : CST->getSExtValue(); Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); break; } // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. BlockAddresses and BasicBlocks are fine though. if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) && !(isa(Op) || isa(Op))) return; // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. if (auto *GA = dyn_cast(Op)) // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference( Subtarget.classifyGlobalReference(GA->getGlobal()))) return; break; } } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } /// Check if \p RC is a general purpose register class. /// I.e., GR* or one of their variant. static bool isGRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::GR8RegClass) || RC.hasSuperClassEq(&X86::GR16RegClass) || RC.hasSuperClassEq(&X86::GR32RegClass) || RC.hasSuperClassEq(&X86::GR64RegClass) || RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); } /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::FR16XRegClass) || RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || RC.hasSuperClassEq(&X86::VR512RegClass); } /// Check if \p RC is a mask register class. /// I.e., VK* or one of their variant. 
static bool isVKClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::VK1RegClass) || RC.hasSuperClassEq(&X86::VK2RegClass) || RC.hasSuperClassEq(&X86::VK4RegClass) || RC.hasSuperClassEq(&X86::VK8RegClass) || RC.hasSuperClassEq(&X86::VK16RegClass) || RC.hasSuperClassEq(&X86::VK32RegClass) || RC.hasSuperClassEq(&X86::VK64RegClass); } static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) { return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32(); } std::pair X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { default: break; // 'A' means [ER]AX + [ER]DX. case 'A': if (Subtarget.is64Bit()) return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && "Expecting 64, 32 or 16 bit subtarget"); return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'k': if (Subtarget.hasAVX512()) { if (VT == MVT::v1i1 || VT == MVT::i1) return std::make_pair(0U, &X86::VK1RegClass); if (VT == MVT::v8i1 || VT == MVT::i8) return std::make_pair(0U, &X86::VK8RegClass); if (VT == MVT::v16i1 || VT == MVT::i16) return std::make_pair(0U, &X86::VK16RegClass); } if (Subtarget.hasBWI()) { if (VT == MVT::v32i1 || VT == MVT::i32) return std::make_pair(0U, &X86::VK32RegClass); if (VT == MVT::v64i1 || VT == MVT::i64) return std::make_pair(0U, &X86::VK64RegClass); } break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget.is64Bit()) { if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? 
&X86::GR8RegClass : &X86::GR8_NOREX2RegClass); if (VT == MVT::i16) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? &X86::GR16RegClass : &X86::GR16_NOREX2RegClass); if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? &X86::GR32RegClass : &X86::GR32_NOREX2RegClass); if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? &X86::GR64RegClass : &X86::GR64_NOREX2RegClass); break; } [[fallthrough]]; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i32 || VT == MVT::f32 || (!VT.isVector() && !Subtarget.is64Bit())) return std::make_pair(0U, &X86::GR32_ABCDRegClass); if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? &X86::GR8RegClass : &X86::GR8_NOREX2RegClass); if (VT == MVT::i16) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? &X86::GR16RegClass : &X86::GR16_NOREX2RegClass); if (VT == MVT::i32 || VT == MVT::f32 || (!VT.isVector() && !Subtarget.is64Bit())) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? &X86::GR32RegClass : &X86::GR32_NOREX2RegClass); if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, useEGPRInlineAsm(Subtarget) ? 
&X86::GR64RegClass : &X86::GR64_NOREX2RegClass); break; case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || VT == MVT::f32 || (!VT.isVector() && !Subtarget.is64Bit())) return std::make_pair(0U, &X86::GR32_NOREXRegClass); if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, &X86::GR64_NOREXRegClass); break; case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) return std::make_pair(0U, &X86::RFP80RegClass); break; case 'y': // MMX_REGS if MMX allowed. if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; bool VConstraint = (Constraint[0] == 'v'); switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f16: if (VConstraint && Subtarget.hasFP16()) return std::make_pair(0U, &X86::FR16XRegClass); break; case MVT::f32: case MVT::i32: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); case MVT::i128: if (Subtarget.is64Bit()) { if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); } break; // Vector types and fp128. 
case MVT::v8f16: if (!Subtarget.hasFP16()) break; if (VConstraint) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); case MVT::v8bf16: if (!Subtarget.hasBF16() || !Subtarget.hasVLX()) break; if (VConstraint) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); case MVT::f128: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); // AVX types. case MVT::v16f16: if (!Subtarget.hasFP16()) break; if (VConstraint) return std::make_pair(0U, &X86::VR256XRegClass); return std::make_pair(0U, &X86::VR256RegClass); case MVT::v16bf16: if (!Subtarget.hasBF16() || !Subtarget.hasVLX()) break; if (VConstraint) return std::make_pair(0U, &X86::VR256XRegClass); return std::make_pair(0U, &X86::VR256RegClass); case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); if (Subtarget.hasAVX()) return std::make_pair(0U, &X86::VR256RegClass); break; case MVT::v32f16: if (!Subtarget.hasFP16()) break; if (VConstraint) return std::make_pair(0U, &X86::VR512RegClass); return std::make_pair(0U, &X86::VR512_0_15RegClass); case MVT::v32bf16: if (!Subtarget.hasBF16()) break; if (VConstraint) return std::make_pair(0U, &X86::VR512RegClass); return std::make_pair(0U, &X86::VR512_0_15RegClass); case MVT::v64i8: case MVT::v32i16: case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: if (!Subtarget.hasAVX512()) break; if (VConstraint) return std::make_pair(0U, &X86::VR512RegClass); return std::make_pair(0U, &X86::VR512_0_15RegClass); } break; } } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { switch (Constraint[1]) { default: break; case 'i': case 't': case '2': return 
getRegForInlineAsmConstraint(TRI, "x", VT); case 'm': if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': if (!Subtarget.hasSSE1()) break; switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f16: if (!Subtarget.hasFP16()) break; return std::make_pair(X86::XMM0, &X86::FR16XRegClass); case MVT::f32: case MVT::i32: return std::make_pair(X86::XMM0, &X86::FR32RegClass); case MVT::f64: case MVT::i64: return std::make_pair(X86::XMM0, &X86::FR64RegClass); case MVT::v8f16: if (!Subtarget.hasFP16()) break; return std::make_pair(X86::XMM0, &X86::VR128RegClass); case MVT::v8bf16: if (!Subtarget.hasBF16() || !Subtarget.hasVLX()) break; return std::make_pair(X86::XMM0, &X86::VR128RegClass); case MVT::f128: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: return std::make_pair(X86::XMM0, &X86::VR128RegClass); // AVX types. case MVT::v16f16: if (!Subtarget.hasFP16()) break; return std::make_pair(X86::YMM0, &X86::VR256RegClass); case MVT::v16bf16: if (!Subtarget.hasBF16() || !Subtarget.hasVLX()) break; return std::make_pair(X86::YMM0, &X86::VR256RegClass); case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: if (Subtarget.hasAVX()) return std::make_pair(X86::YMM0, &X86::VR256RegClass); break; case MVT::v32f16: if (!Subtarget.hasFP16()) break; return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass); case MVT::v32bf16: if (!Subtarget.hasBF16()) break; return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass); case MVT::v64i8: case MVT::v32i16: case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: if (Subtarget.hasAVX512()) return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass); break; } break; case 'k': // This register class doesn't allocate k0 for masked vector operation. 
if (Subtarget.hasAVX512()) { if (VT == MVT::v1i1 || VT == MVT::i1) return std::make_pair(0U, &X86::VK1WMRegClass); if (VT == MVT::v8i1 || VT == MVT::i8) return std::make_pair(0U, &X86::VK8WMRegClass); if (VT == MVT::v16i1 || VT == MVT::i16) return std::make_pair(0U, &X86::VK16WMRegClass); } if (Subtarget.hasBWI()) { if (VT == MVT::v32i1 || VT == MVT::i32) return std::make_pair(0U, &X86::VK32WMRegClass); if (VT == MVT::v64i1 || VT == MVT::i64) return std::make_pair(0U, &X86::VK64WMRegClass); } break; } } else if (Constraint.size() == 2 && Constraint[0] == 'j') { switch (Constraint[1]) { default: break; case 'r': if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREX2RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREX2RegClass); if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32_NOREX2RegClass); if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, &X86::GR64_NOREX2RegClass); break; case 'R': if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT != MVT::f80 && !VT.isVector()) return std::make_pair(0U, &X86::GR64RegClass); break; } } if (parseConstraintCode(Constraint) != X86::COND_INVALID) return std::make_pair(0U, &X86::GR32RegClass); // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { // Only match x87 registers if the VT is one SelectionDAGBuilder can convert // to/from f80. 
if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && Constraint[6] == '}') { // st(7) is not allocatable and thus not a member of RFP80. Return // singleton class in cases where we have a reference to it. if (Constraint[4] == '7') return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); return std::make_pair(X86::FP0 + Constraint[4] - '0', &X86::RFP80RegClass); } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_insensitive(Constraint)) return std::make_pair(X86::FP0, &X86::RFP80RegClass); } // flags -> EFLAGS if (StringRef("{flags}").equals_insensitive(Constraint)) return std::make_pair(X86::EFLAGS, &X86::CCRRegClass); // dirflag -> DF // Only allow for clobber. if (StringRef("{dirflag}").equals_insensitive(Constraint) && VT == MVT::Other) return std::make_pair(X86::DF, &X86::DFCCRRegClass); // fpsr -> FPSW // Only allow for clobber. if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other) return std::make_pair(X86::FPSW, &X86::FPCCRRegClass); return Res; } // Make sure it isn't a register that requires 64-bit mode. if (!Subtarget.is64Bit() && (isFRClass(*Res.second) || isGRClass(*Res.second)) && TRI->getEncodingValue(Res.first) >= 8) { // Register requires REX prefix, but we're in 32-bit mode. return std::make_pair(0, nullptr); } // Make sure it isn't a register that requires AVX512. if (!Subtarget.hasAVX512() && isFRClass(*Res.second) && TRI->getEncodingValue(Res.first) & 0x10) { // Register requires EVEX prefix. return std::make_pair(0, nullptr); } // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. 
// MVT::Other is used to specify clobber names. if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should // return "eax". This should even work for things like getting 64bit integer // registers when given an f64 type. const TargetRegisterClass *Class = Res.second; // The generic code will match the first register class that contains the // given register. Thus, based on the ordering of the tablegened file, // the "plain" GR classes might not come first. // Therefore, use a helper method. if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; if (Size != 8 && Size != 16 && Size != 32 && Size != 64) return std::make_pair(0, nullptr); Register DestReg = getX86SubSuperRegister(Res.first, Size); if (DestReg.isValid()) { bool is64Bit = Subtarget.is64Bit(); const TargetRegisterClass *RC = Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr); if (Size == 64 && !is64Bit) { // Model GCC's behavior here and select a fixed pair of 32-bit // registers. 
switch (DestReg) { case X86::RAX: return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); case X86::RDX: return std::make_pair(X86::EDX, &X86::GR32_DCRegClass); case X86::RCX: return std::make_pair(X86::ECX, &X86::GR32_CBRegClass); case X86::RBX: return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass); case X86::RSI: return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass); case X86::RDI: return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass); case X86::RBP: return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass); default: return std::make_pair(0, nullptr); } } if (RC && RC->contains(DestReg)) return std::make_pair(DestReg, RC); return Res; } // No register found/type mismatch. return std::make_pair(0, nullptr); } else if (isFRClass(*Class)) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. 
if (VT == MVT::f16) Res.second = &X86::FR16XRegClass; else if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32XRegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64XRegClass; else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT)) Res.second = &X86::VR128XRegClass; else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT)) Res.second = &X86::VR256XRegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } else if (isVKClass(*Class)) { if (VT == MVT::v1i1 || VT == MVT::i1) Res.second = &X86::VK1RegClass; else if (VT == MVT::v8i1 || VT == MVT::i8) Res.second = &X86::VK8RegClass; else if (VT == MVT::v16i1 || VT == MVT::i16) Res.second = &X86::VK16RegClass; else if (VT == MVT::v32i1 || VT == MVT::i32) Res.second = &X86::VK32RegClass; else if (VT == MVT::v64i1 || VT == MVT::i64) Res.second = &X86::VK64RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } return Res; } bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. // The exception to this is vector division. Since x86 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasFnAttr(Attribute::MinSize); return OptSize && !VT.isVector(); } void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (!Subtarget.is64Bit()) return; // Update IsSplitCSR in X86MachineFunctionInfo. 
X86MachineFunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void X86TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) RC = &X86::GR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert( Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. 
for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } MachineInstr * X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const { assert(MBBI->isCall() && MBBI->getCFIType() && "Invalid call instruction for a KCFI check"); MachineFunction &MF = *MBB.getParent(); // If the call target is a memory operand, unfold it and use R11 for the // call, so KCFI_CHECK won't have to recompute the address. switch (MBBI->getOpcode()) { case X86::CALL64m: case X86::CALL64m_NT: case X86::TAILJMPm64: case X86::TAILJMPm64_REX: { MachineBasicBlock::instr_iterator OrigCall = MBBI; SmallVector NewMIs; if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true, /*UnfoldStore=*/false, NewMIs)) report_fatal_error("Failed to unfold memory operand for a KCFI check"); for (auto *NewMI : NewMIs) MBBI = MBB.insert(OrigCall, NewMI); assert(MBBI->isCall() && "Unexpected instruction after memory operand unfolding"); if (OrigCall->shouldUpdateCallSiteInfo()) MF.moveCallSiteInfo(&*OrigCall, &*MBBI); MBBI->setCFIType(MF, OrigCall->getCFIType()); OrigCall->eraseFromParent(); break; } default: break; } MachineOperand &Target = MBBI->getOperand(0); Register TargetReg; switch (MBBI->getOpcode()) { case X86::CALL64r: case X86::CALL64r_NT: case X86::TAILJMPr64: case X86::TAILJMPr64_REX: assert(Target.isReg() && "Unexpected target operand for an indirect call"); Target.setIsRenamable(false); TargetReg = Target.getReg(); break; case X86::CALL64pcrel32: case X86::TAILJMPd64: assert(Target.isSymbol() && "Unexpected target operand for a direct call"); // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for // 64-bit indirect thunk calls. 
assert(StringRef(Target.getSymbolName()).ends_with("_r11") && "Unexpected register for an indirect thunk call"); TargetReg = X86::R11; break; default: llvm_unreachable("Unexpected CFI call opcode"); break; } return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK)) .addReg(TargetReg) .addImm(MBBI->getCFIType()) .getInstr(); } /// Returns true if stack probing through a function call is requested. bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const { return !getStackProbeSymbolName(MF).empty(); } /// Returns true if stack probing through inline assembly is requested. bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const { // No inline stack probe for Windows, they have their own mechanism. if (Subtarget.isOSWindows() || MF.getFunction().hasFnAttribute("no-stack-arg-probe")) return false; // If the function specifically requests inline stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == "inline-asm"; return false; } /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const { // Inline Stack probes disable stack probe call if (hasInlineStackProbe(MF)) return ""; // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); // Generally, if we aren't on Windows, the platform ABI does not include // support for stack probes, so don't emit them. if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() || MF.getFunction().hasFnAttribute("no-stack-arg-probe")) return ""; // We need a stack probe to conform to the Windows ABI. Choose the right // symbol. if (Subtarget.is64Bit()) return Subtarget.isTargetCygMing() ? 
"___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } unsigned X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const { // The default stack probe size is 4096 if the function has no stackprobesize // attribute. return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size", 4096); } Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { if (ML && ML->isInnermost() && ExperimentalPrefInnermostLoopAlignment.getNumOccurrences()) return Align(1ULL << ExperimentalPrefInnermostLoopAlignment); return TargetLowering::getPrefLoopAlignment(); }