Mirror of https://github.com/Gericom/teak-llvm.git (synced 2025-06-19 11:35:51 -04:00)

Summary:
This patch pushes the AIX vararg unimplemented error diagnostic later and allows vararg calls so long as all of the arguments can be passed in registers.

It extends the AIX calling convention implementation to initialize GPR(s) for vararg float arguments. On AIX, both GPR(s) and an FPR are allocated for each floating-point argument; the GPR(s) are initialized only for vararg calls, otherwise the callee is expected to retrieve the float argument from the FPR. An f64 on AIX PPC32 requires special handling in order to allocate and initialize 2 GPRs: a bitcast, an SRL, and a truncation initialize one GPR with the MSW, while a bitcast and a truncation initialize the other GPR with the LSW. A future patch will follow to add support for arguments passed on the stack.

Patch provided by: cebowleratibm

Reviewers: sfertile, ZarkoCA, hubert.reinterpretcast

Differential Revision: https://reviews.llvm.org/D71013
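As an illustration of the f64 split described above, here is a minimal sketch using generic SelectionDAG builders; the names (Arg, dl, DAG) and the use of MVT::i64 for the shift-amount type are stand-ins for this write-up, not code taken from the patch itself:

  // Reinterpret the f64 argument bits as an i64 so they can be shifted.
  SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
  // MSW: shift the high 32 bits down, then truncate to i32 for the first GPR.
  SDValue MSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                            DAG.getNode(ISD::SRL, dl, MVT::i64, Bits,
                                        DAG.getConstant(32, dl, MVT::i64)));
  // LSW: truncate directly to i32 for the second GPR.
  SDValue LSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Bits);
  // MSW and LSW would then be copied into the two GPRs allocated for the
  // vararg f64, alongside the FPR that also carries the value on AIX.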
15858 lines | 615 KiB | C++
//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
|
|
#include "MCTargetDesc/PPCPredicates.h"
|
|
#include "PPC.h"
|
|
#include "PPCCCState.h"
|
|
#include "PPCCallingConv.h"
|
|
#include "PPCFrameLowering.h"
|
|
#include "PPCInstrInfo.h"
|
|
#include "PPCMachineFunctionInfo.h"
|
|
#include "PPCPerfectShuffle.h"
|
|
#include "PPCRegisterInfo.h"
|
|
#include "PPCSubtarget.h"
|
|
#include "PPCTargetMachine.h"
|
|
#include "llvm/ADT/APFloat.h"
|
|
#include "llvm/ADT/APInt.h"
|
|
#include "llvm/ADT/ArrayRef.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/ADT/None.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
#include "llvm/ADT/SmallSet.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/ADT/StringRef.h"
|
|
#include "llvm/ADT/StringSwitch.h"
|
|
#include "llvm/CodeGen/CallingConvLower.h"
|
|
#include "llvm/CodeGen/ISDOpcodes.h"
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
#include "llvm/CodeGen/MachineJumpTableInfo.h"
|
|
#include "llvm/CodeGen/MachineLoopInfo.h"
|
|
#include "llvm/CodeGen/MachineMemOperand.h"
|
|
#include "llvm/CodeGen/MachineModuleInfo.h"
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
#include "llvm/CodeGen/RuntimeLibcalls.h"
|
|
#include "llvm/CodeGen/SelectionDAG.h"
|
|
#include "llvm/CodeGen/SelectionDAGNodes.h"
|
|
#include "llvm/CodeGen/TargetInstrInfo.h"
|
|
#include "llvm/CodeGen/TargetLowering.h"
|
|
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
|
|
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
|
#include "llvm/CodeGen/ValueTypes.h"
|
|
#include "llvm/IR/CallSite.h"
|
|
#include "llvm/IR/CallingConv.h"
|
|
#include "llvm/IR/Constant.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/DataLayout.h"
|
|
#include "llvm/IR/DebugLoc.h"
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/GlobalValue.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/Intrinsics.h"
|
|
#include "llvm/IR/IntrinsicsPowerPC.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/IR/Type.h"
|
|
#include "llvm/IR/Use.h"
|
|
#include "llvm/IR/Value.h"
|
|
#include "llvm/MC/MCContext.h"
|
|
#include "llvm/MC/MCExpr.h"
|
|
#include "llvm/MC/MCRegisterInfo.h"
|
|
#include "llvm/MC/MCSymbolXCOFF.h"
|
|
#include "llvm/Support/AtomicOrdering.h"
|
|
#include "llvm/Support/BranchProbability.h"
|
|
#include "llvm/Support/Casting.h"
|
|
#include "llvm/Support/CodeGen.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Compiler.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include "llvm/Support/Format.h"
|
|
#include "llvm/Support/KnownBits.h"
|
|
#include "llvm/Support/MachineValueType.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
#include "llvm/Target/TargetOptions.h"
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <iterator>
|
|
#include <list>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
using namespace llvm;

#define DEBUG_TYPE "ppc-lowering"

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
    cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
    cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
    cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableSCO("disable-ppc-sco",
    cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
    cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);

static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
    cl::desc("enable quad precision float support on ppc"), cl::Hidden);

static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
    cl::desc("use absolute jump tables on ppc"), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");

static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);

static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);

// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;

PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
                                     const PPCSubtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
  // arguments are at least 4/8 bytes aligned.
  bool isPPC64 = Subtarget.isPPC64();
  setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));

  // Set up the register classes.
  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
  if (!useSoftFloat()) {
    if (hasSPE()) {
      addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
      addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
    } else {
      addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
      addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
    }
  }

  // Match BITREVERSE to customized fast code sequence in the td file.
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
  }

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PowerPC has pre-inc loads and stores.
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
  if (!Subtarget.hasSPE()) {
    setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
  }

  // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Legal);
    setOperationAction(ISD::ADDE, VT, Legal);
    setOperationAction(ISD::SUBC, VT, Legal);
    setOperationAction(ISD::SUBE, VT, Legal);
  }

  if (Subtarget.useCRBits()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

    if (isPPC64 || Subtarget.hasFPCVT()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
                         isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
    }

    // PowerPC does not support direct load/store of condition registers.
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i1, Custom);

    // FIXME: Remove this once the ANDI glue bug is fixed:
    if (ANDIGlueBug)
      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

    for (MVT VT : MVT::integer_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      setTruncStoreAction(VT, MVT::i1, Expand);
    }

    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
  }

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);

  // We do not currently implement these libm ops for PowerPC.
  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
  setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
  setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);

  // PowerPC has no SREM/UREM instructions unless we are on P9.
  // On P9 we may use a hardware instruction to compute the remainder.
  // The instructions are not legalized directly because in the cases where the
  // result of both the remainder and the division is required it is more
  // efficient to compute the remainder from the result of the division rather
  // than use the remainder instruction.
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::SREM, MVT::i32, Custom);
    setOperationAction(ISD::UREM, MVT::i32, Custom);
    setOperationAction(ISD::SREM, MVT::i64, Custom);
    setOperationAction(ISD::UREM, MVT::i64, Custom);
  } else {
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::UREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i64, Expand);
    setOperationAction(ISD::UREM, MVT::i64, Expand);
  }

  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  // We don't support sin/cos/sqrt/fmod/pow
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FPOW , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);
  setOperationAction(ISD::FPOW , MVT::f32, Expand);
  if (Subtarget.hasSPE()) {
    setOperationAction(ISD::FMA , MVT::f64, Expand);
    setOperationAction(ISD::FMA , MVT::f32, Expand);
  } else {
    setOperationAction(ISD::FMA , MVT::f64, Legal);
    setOperationAction(ISD::FMA , MVT::f32, Legal);
  }

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // If we're enabling GP optimizations, use hardware square root
  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
        Subtarget.hasFRE()))
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);

  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
        Subtarget.hasFRES()))
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  if (Subtarget.hasFCPSGN()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FROUND, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
  }

  // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
  // to speed up scalar BSWAP64.
  // CTPOP or CTTZ were introduced in P8/P9 respectively.
  setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
  if (Subtarget.hasP9Vector())
    setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
  else
    setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
    setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
  } else {
    setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
  }

  if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
    setOperationAction(ISD::CTPOP, MVT::i32 , Legal);
    setOperationAction(ISD::CTPOP, MVT::i64 , Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32 , Expand);
    setOperationAction(ISD::CTPOP, MVT::i64 , Expand);
  }

  // PowerPC does not have ROTR
  setOperationAction(ISD::ROTR, MVT::i32 , Expand);
  setOperationAction(ISD::ROTR, MVT::i64 , Expand);

  if (!Subtarget.useCRBits()) {
    // PowerPC does not have Select
    setOperationAction(ISD::SELECT, MVT::i32, Expand);
    setOperationAction(ISD::SELECT, MVT::i64, Expand);
    setOperationAction(ISD::SELECT, MVT::f32, Expand);
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // PowerPC wants to optimize integer setcc a bit
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::SETCC, MVT::i32, Custom);

  // PowerPC does not have BRCOND which requires SetCC
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  if (Subtarget.hasSPE()) {
    // SPE has built-in conversions
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
  } else {
    // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

    // PowerPC does not have [U|S]INT_TO_FP
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
  }

  if (Subtarget.hasDirectMove() && isPPC64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
    setOperationAction(ISD::BITCAST, MVT::f64, Legal);
    if (TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::LRINT, MVT::f64, Legal);
      setOperationAction(ISD::LRINT, MVT::f32, Legal);
      setOperationAction(ISD::LLRINT, MVT::f64, Legal);
      setOperationAction(ISD::LLRINT, MVT::f32, Legal);
      setOperationAction(ISD::LROUND, MVT::f64, Legal);
      setOperationAction(ISD::LROUND, MVT::f32, Legal);
      setOperationAction(ISD::LLROUND, MVT::f64, Legal);
      setOperationAction(ISD::LLROUND, MVT::f32, Legal);
    }
  } else {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
  }

  // We cannot sextinreg(i1). Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no
  // other SjLj exception interfaces are implemented, so please don't build
  // your own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // TRAP is legal.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // TRAMPOLINE is custom lowered.
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART , MVT::Other, Custom);

  if (Subtarget.is64BitELFABI()) {
    // VAARG always uses double-word chunks, so promote anything smaller.
    setOperationAction(ISD::VAARG, MVT::i1, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i8, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i16, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i32, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
  } else if (Subtarget.is32BitELFABI()) {
    // VAARG is custom lowered with the 32-bit SVR4 ABI.
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VAARG, MVT::i64, Custom);
  } else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  // VACOPY is custom lowered with the 32-bit SVR4 ABI.
  if (Subtarget.is32BitELFABI())
    setOperationAction(ISD::VACOPY , MVT::Other, Custom);
  else
    setOperationAction(ISD::VACOPY , MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // To handle counter-based loop conditions.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Comparisons that require checking two conditions.
  if (Subtarget.hasSPE()) {
    setCondCodeAction(ISD::SETO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::f64, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
  }
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

  if (Subtarget.has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // This is just the low 32 bits of a (signed) fp->i64 conversion.
    // We cannot do this with Promote because i64 is not a legal type.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  } else {
    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
    if (Subtarget.hasSPE())
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
    else
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
  }

  // With the instructions enabled under FPCVT, we can do everything.
  if (Subtarget.hasFPCVT()) {
    if (Subtarget.has64BitSupport()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  if (Subtarget.use64BitRegs()) {
    // 64-bit PowerPC implementations can support i64 types directly
    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
    // 64-bit PowerPC wants to expand i128 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  } else {
    // 32-bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  if (Subtarget.hasVSX()) {
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  }

  if (Subtarget.hasAltivec()) {
    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD, VT, Legal);
      setOperationAction(ISD::SUB, VT, Legal);

      // For v2i64, these are only valid with P8Vector. This is corrected after
      // the loop.
      if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
        setOperationAction(ISD::SMAX, VT, Legal);
        setOperationAction(ISD::SMIN, VT, Legal);
        setOperationAction(ISD::UMAX, VT, Legal);
        setOperationAction(ISD::UMIN, VT, Legal);
      } else {
        setOperationAction(ISD::SMAX, VT, Expand);
        setOperationAction(ISD::SMIN, VT, Expand);
        setOperationAction(ISD::UMAX, VT, Expand);
        setOperationAction(ISD::UMIN, VT, Expand);
      }

      if (Subtarget.hasVSX()) {
        setOperationAction(ISD::FMAXNUM, VT, Legal);
        setOperationAction(ISD::FMINNUM, VT, Legal);
      }

      // Vector instructions introduced in P8
      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
        setOperationAction(ISD::CTPOP, VT, Legal);
        setOperationAction(ISD::CTLZ, VT, Legal);
      } else {
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
      }

      // Vector instructions introduced in P9
      if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
        setOperationAction(ISD::CTTZ, VT, Legal);
      else
        setOperationAction(ISD::CTTZ, VT, Expand);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND , VT, Promote);
      AddPromotedToType (ISD::AND , VT, MVT::v4i32);
      setOperationAction(ISD::OR , VT, Promote);
      AddPromotedToType (ISD::OR , VT, MVT::v4i32);
      setOperationAction(ISD::XOR , VT, Promote);
      AddPromotedToType (ISD::XOR , VT, MVT::v4i32);
      setOperationAction(ISD::LOAD , VT, Promote);
      AddPromotedToType (ISD::LOAD , VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
      setOperationAction(ISD::VSELECT, VT, Legal);
      setOperationAction(ISD::SELECT_CC, VT, Promote);
      AddPromotedToType (ISD::SELECT_CC, VT, MVT::v4i32);
      setOperationAction(ISD::STORE, VT, Promote);
      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL , VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::UDIV, VT, Expand);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FNEG, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FABS, VT, Expand);
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FCEIL, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }
    if (!Subtarget.hasP8Vector()) {
      setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
      setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
      setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
      setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
    }

    for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8})
      setOperationAction(ISD::ABS, VT, Custom);

    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    // Vector truncates to sub-word integer that fit in an Altivec/VSX register
    // are cheap, so handle them before they get expanded to scalar.
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);

    setOperationAction(ISD::AND , MVT::v4i32, Legal);
    setOperationAction(ISD::OR , MVT::v4i32, Legal);
    setOperationAction(ISD::XOR , MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32,
                       Subtarget.useCRBits() ? Legal : Expand);
    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

    // Without hasP8Altivec set, v2i64 SMAX isn't available.
    // But ABS custom lowering requires SMAX support.
    if (!Subtarget.hasP8Altivec())
      setOperationAction(ISD::ABS, MVT::v2i64, Expand);

    // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
    if (Subtarget.hasAltivec())
      for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
        setOperationAction(ISD::ROTL, VT, Legal);
    // With hasP8Altivec set, we can lower ISD::ROTL to vrld.
    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::ROTL, MVT::v2i64, Legal);

    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FMA, MVT::v4f32, Legal);

    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    }

    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    else
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    // Altivec does not contain unordered floating-point compare instructions
    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
      if (Subtarget.hasP8Vector()) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
      }
      if (Subtarget.hasDirectMove() && isPPC64) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
      }
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);

      // The nearbyint variants are not allowed to raise the inexact exception
      // so we can only code-gen them with unsafe math.
      if (TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
        setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
      }

      setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
      setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);

      setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
      setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
      setOperationAction(ISD::FROUND, MVT::f32, Legal);

      setOperationAction(ISD::MUL, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);

      // Share the Altivec comparison restrictions.
      setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);

      setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
      setOperationAction(ISD::STORE, MVT::v2f64, Legal);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);

      if (Subtarget.hasP8Vector())
        addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);

      addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);

      addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);

      if (Subtarget.hasP8Altivec()) {
        setOperationAction(ISD::SHL, MVT::v2i64, Legal);
        setOperationAction(ISD::SRA, MVT::v2i64, Legal);
        setOperationAction(ISD::SRL, MVT::v2i64, Legal);

        // 128 bit shifts can be accomplished via 3 instructions for SHL and
        // SRL, but not for SRA because of the instructions available:
        // VS{RL} and VS{RL}O. However due to direct move costs, it's not worth
        // doing
        setOperationAction(ISD::SHL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRA, MVT::v1i128, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
      } else {
        setOperationAction(ISD::SHL, MVT::v2i64, Expand);
        setOperationAction(ISD::SRA, MVT::v2i64, Expand);
        setOperationAction(ISD::SRL, MVT::v2i64, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Custom);

        // VSX v2i64 only supports non-arithmetic operations.
        setOperationAction(ISD::ADD, MVT::v2i64, Expand);
        setOperationAction(ISD::SUB, MVT::v2i64, Expand);
      }

      setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
      AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
      AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

      // Custom handling for partial vectors of integers converted to
      // floating point. We already have optimal handling for v2i32 through
      // the DAG combine, so those aren't necessary.
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);

      setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
      setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
      setOperationAction(ISD::FABS, MVT::v4f32, Legal);
      setOperationAction(ISD::FABS, MVT::v2f64, Legal);
      setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
      setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);

      if (Subtarget.hasDirectMove())
        setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
    }

    if (Subtarget.hasP8Altivec()) {
      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
      addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
    }

    if (Subtarget.hasP9Vector()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

      // 128 bit shifts can be accomplished via 3 instructions for SHL and
      // SRL, but not for SRA because of the instructions available:
      // VS{RL} and VS{RL}O.
      setOperationAction(ISD::SHL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRA, MVT::v1i128, Expand);

      if (EnableQuadPrecision) {
        addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
        setOperationAction(ISD::FADD, MVT::f128, Legal);
        setOperationAction(ISD::FSUB, MVT::f128, Legal);
        setOperationAction(ISD::FDIV, MVT::f128, Legal);
        setOperationAction(ISD::FMUL, MVT::f128, Legal);
        setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
        // No extending loads to f128 on PPC.
        for (MVT FPT : MVT::fp_valuetypes())
          setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
        setOperationAction(ISD::FMA, MVT::f128, Legal);
        setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
        setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
        setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
        setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
        setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
        setCondCodeAction(ISD::SETONE, MVT::f128, Expand);

        setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
        setOperationAction(ISD::FRINT, MVT::f128, Legal);
        setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
        setOperationAction(ISD::FCEIL, MVT::f128, Legal);
        setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
        setOperationAction(ISD::FROUND, MVT::f128, Legal);

        setOperationAction(ISD::SELECT, MVT::f128, Expand);
        setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
        setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
        setTruncStoreAction(MVT::f128, MVT::f64, Expand);
        setTruncStoreAction(MVT::f128, MVT::f32, Expand);
        setOperationAction(ISD::BITCAST, MVT::i128, Custom);
        // No implementation for these ops for PowerPC.
        setOperationAction(ISD::FSIN , MVT::f128, Expand);
        setOperationAction(ISD::FCOS , MVT::f128, Expand);
        setOperationAction(ISD::FPOW, MVT::f128, Expand);
        setOperationAction(ISD::FPOWI, MVT::f128, Expand);
        setOperationAction(ISD::FREM, MVT::f128, Expand);
      }
      setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
      setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
      setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
      setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
      setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
    }

    if (Subtarget.hasP9Altivec()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);

      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
      setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
    }
  }

  if (Subtarget.hasQPX()) {
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FREM, MVT::v4f64, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand);

    setOperationAction(ISD::LOAD , MVT::v4f64, Custom);
    setOperationAction(ISD::STORE , MVT::v4f64, Custom);

    setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
    setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f64, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand);
    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);

    setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal);
    setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand);

    setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);

    setOperationAction(ISD::FNEG , MVT::v4f64, Legal);
    setOperationAction(ISD::FABS , MVT::v4f64, Legal);
    setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
    setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
    setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
    setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP , MVT::v4f64, Expand);
    setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal);

    addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FREM, MVT::v4f32, Expand);

    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
    setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand);

    setOperationAction(ISD::LOAD , MVT::v4f32, Custom);
    setOperationAction(ISD::STORE , MVT::v4f32, Custom);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal);
    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand);
    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal);
    setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand);

    setOperationAction(ISD::FNEG , MVT::v4f32, Legal);
    setOperationAction(ISD::FABS , MVT::v4f32, Legal);
    setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP , MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand);

    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);

    setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal);

    addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass);

    setOperationAction(ISD::AND , MVT::v4i1, Legal);
    setOperationAction(ISD::OR , MVT::v4i1, Legal);
    setOperationAction(ISD::XOR , MVT::v4i1, Legal);

    if (!Subtarget.useCRBits())
      setOperationAction(ISD::SELECT, MVT::v4i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v4i1, Legal);

    setOperationAction(ISD::LOAD , MVT::v4i1, Custom);
    setOperationAction(ISD::STORE , MVT::v4i1, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand);
    setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);

    addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass);

    setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);

    // These need to set FE_INEXACT, and so cannot be vectorized here.
    setOperationAction(ISD::FRINT, MVT::v4f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);

    if (TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    } else {
      setOperationAction(ISD::FDIV, MVT::v4f64, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f64, Expand);

      setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    }
  }

  if (Subtarget.has64BitSupport())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);

  if (!isPPC64) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
  }

  setBooleanContents(ZeroOrOneBooleanContent);

  if (Subtarget.hasAltivec()) {
    // Altivec instructions set fields to all zeros or all ones.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  }

  if (!isPPC64) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  }

  setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  if (Subtarget.hasFPCVT())
    setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::BR_CC);
  if (Subtarget.useCRBits())
    setTargetDAGCombine(ISD::BRCOND);
  setTargetDAGCombine(ISD::BSWAP);
  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INTRINSIC_VOID);

  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);

  if (Subtarget.useCRBits()) {
    setTargetDAGCombine(ISD::TRUNCATE);
    setTargetDAGCombine(ISD::SETCC);
    setTargetDAGCombine(ISD::SELECT_CC);
  }

  // Use reciprocal estimates.
  if (TM.Options.UnsafeFPMath) {
    setTargetDAGCombine(ISD::FDIV);
    setTargetDAGCombine(ISD::FSQRT);
  }

  if (Subtarget.hasP9Altivec()) {
    setTargetDAGCombine(ISD::ABS);
    setTargetDAGCombine(ISD::VSELECT);
  }

  // Darwin long double math library functions have $LDBL128 appended.
  if (Subtarget.isDarwin()) {
    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
  }

  if (EnableQuadPrecision) {
    setLibcallName(RTLIB::LOG_F128, "logf128");
    setLibcallName(RTLIB::LOG2_F128, "log2f128");
    setLibcallName(RTLIB::LOG10_F128, "log10f128");
    setLibcallName(RTLIB::EXP_F128, "expf128");
    setLibcallName(RTLIB::EXP2_F128, "exp2f128");
    setLibcallName(RTLIB::SIN_F128, "sinf128");
    setLibcallName(RTLIB::COS_F128, "cosf128");
    setLibcallName(RTLIB::POW_F128, "powf128");
    setLibcallName(RTLIB::FMIN_F128, "fminf128");
    setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
    setLibcallName(RTLIB::POWI_F128, "__powikf2");
    setLibcallName(RTLIB::REM_F128, "fmodf128");
  }

  // With 32 condition bits, we don't need to sink (and duplicate) compares
  // aggressively in CodeGenPrep.
  if (Subtarget.useCRBits()) {
    setHasMultipleConditionRegisters();
    setJumpIsExpensive();
  }

  setMinFunctionAlignment(Align(4));
  if (Subtarget.isDarwin())
    setPrefFunctionAlignment(Align(16));

  switch (Subtarget.getCPUDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_A2:
  case PPC::DIR_E500:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
  case PPC::DIR_PWR_FUTURE:
    setPrefLoopAlignment(Align(16));
    setPrefFunctionAlignment(Align(16));
    break;
  }

  if (Subtarget.enableMachineScheduler())
    setSchedulingPreference(Sched::Source);
  else
    setSchedulingPreference(Sched::Hybrid);

  computeRegisterProperties(STI.getRegisterInfo());

  // The Freescale cores do better with aggressive inlining of memcpy and
  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
  if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
      Subtarget.getCPUDirective() == PPC::DIR_E5500) {
    MaxStoresPerMemset = 32;
    MaxStoresPerMemsetOptSize = 16;
    MaxStoresPerMemcpy = 32;
    MaxStoresPerMemcpyOptSize = 8;
    MaxStoresPerMemmove = 32;
    MaxStoresPerMemmoveOptSize = 8;
  } else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
    // The A2 also benefits from (very) aggressive inlining of memcpy and
    // friends. The overhead of the function call, even when warm, can be
    // over one hundred cycles.
    MaxStoresPerMemset = 128;
    MaxStoresPerMemcpy = 128;
    MaxStoresPerMemmove = 128;
    MaxLoadsPerMemcmp = 128;
  } else {
    MaxLoadsPerMemcmp = 8;
    MaxLoadsPerMemcmpOptSize = 4;
  }
}

/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
|
|
/// the desired ByVal argument alignment.
|
|
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
|
|
unsigned MaxMaxAlign) {
|
|
if (MaxAlign == MaxMaxAlign)
|
|
return;
|
|
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
|
|
if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
|
|
MaxAlign = 32;
|
|
else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
|
|
MaxAlign = 16;
|
|
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
|
|
unsigned EltAlign = 0;
|
|
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
|
|
if (EltAlign > MaxAlign)
|
|
MaxAlign = EltAlign;
|
|
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
|
|
for (auto *EltTy : STy->elements()) {
|
|
unsigned EltAlign = 0;
|
|
getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
|
|
if (EltAlign > MaxAlign)
|
|
MaxAlign = EltAlign;
|
|
if (MaxAlign == MaxMaxAlign)
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  // Darwin passes everything on 4 byte boundary.
  if (Subtarget.isDarwin())
    return 4;

  // 16byte and wider vectors are passed on 16byte boundary.
  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
  return Align;
}

bool PPCTargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

bool PPCTargetLowering::hasSPE() const {
  return Subtarget.hasSPE();
}

bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  return VT.isScalarInteger();
}

const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((PPCISD::NodeType)Opcode) {
  case PPCISD::FIRST_NUMBER: break;
  case PPCISD::FSEL: return "PPCISD::FSEL";
  case PPCISD::XSMAXCDP: return "PPCISD::XSMAXCDP";
  case PPCISD::XSMINCDP: return "PPCISD::XSMINCDP";
  case PPCISD::FCFID: return "PPCISD::FCFID";
  case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
  case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
  case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
  case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
  case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
  case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
  case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
  case PPCISD::FP_TO_UINT_IN_VSR:
    return "PPCISD::FP_TO_UINT_IN_VSR";
  case PPCISD::FP_TO_SINT_IN_VSR:
    return "PPCISD::FP_TO_SINT_IN_VSR";
  case PPCISD::FRE: return "PPCISD::FRE";
  case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
  case PPCISD::STFIWX: return "PPCISD::STFIWX";
  case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
  case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
  case PPCISD::VPERM: return "PPCISD::VPERM";
  case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
  case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
  case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
  case PPCISD::VECSHL: return "PPCISD::VECSHL";
  case PPCISD::CMPB: return "PPCISD::CMPB";
  case PPCISD::Hi: return "PPCISD::Hi";
  case PPCISD::Lo: return "PPCISD::Lo";
  case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
  case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
  case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
  case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
  case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
  case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
  case PPCISD::SRL: return "PPCISD::SRL";
  case PPCISD::SRA: return "PPCISD::SRA";
  case PPCISD::SHL: return "PPCISD::SHL";
  case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
  case PPCISD::CALL: return "PPCISD::CALL";
  case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
  case PPCISD::MTCTR: return "PPCISD::MTCTR";
  case PPCISD::BCTRL: return "PPCISD::BCTRL";
  case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
  case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
  case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
  case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
  case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
  case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
  case PPCISD::MFVSR: return "PPCISD::MFVSR";
  case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
  case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
  case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
  case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
  case PPCISD::ANDI_rec_1_EQ_BIT:
    return "PPCISD::ANDI_rec_1_EQ_BIT";
  case PPCISD::ANDI_rec_1_GT_BIT:
    return "PPCISD::ANDI_rec_1_GT_BIT";
  case PPCISD::VCMP: return "PPCISD::VCMP";
  case PPCISD::VCMPo: return "PPCISD::VCMPo";
  case PPCISD::LBRX: return "PPCISD::LBRX";
  case PPCISD::STBRX: return "PPCISD::STBRX";
  case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
  case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
  case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
  case PPCISD::STXSIX: return "PPCISD::STXSIX";
  case PPCISD::VEXTS: return "PPCISD::VEXTS";
  case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
  case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
  case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
  case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
  case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";
  case PPCISD::ST_VSR_SCAL_INT:
    return "PPCISD::ST_VSR_SCAL_INT";
  case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
  case PPCISD::BDNZ: return "PPCISD::BDNZ";
  case PPCISD::BDZ: return "PPCISD::BDZ";
  case PPCISD::MFFS: return "PPCISD::MFFS";
  case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
  case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
  case PPCISD::CR6SET: return "PPCISD::CR6SET";
  case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
  case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
  case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
  case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
  case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
  case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
  case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
  case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
  case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
  case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
  case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
  case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
  case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
  case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
  case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
  case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
  case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
  case PPCISD::SC: return "PPCISD::SC";
  case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
  case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
  case PPCISD::RFEBB: return "PPCISD::RFEBB";
  case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
  case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
  case PPCISD::VABSD: return "PPCISD::VABSD";
  case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
  case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
  case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
  case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI";
  case PPCISD::QBFLT: return "PPCISD::QBFLT";
  case PPCISD::QVLFSb: return "PPCISD::QVLFSb";
  case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
  case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
  case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
  case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
  case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
  case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
  case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
  }
  return nullptr;
}

EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
                                          EVT VT) const {
  if (!VT.isVector())
    return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;

  if (Subtarget.hasQPX())
    return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());

  return VT.changeVectorElementTypeToInteger();
}

bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
  return true;
}

//===----------------------------------------------------------------------===//
// Node matching predicates, for use by the tblgen matching code.
//===----------------------------------------------------------------------===//

/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
static bool isFloatingPointZero(SDValue Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isZero();
  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
    // Maybe this has already been legalized into the constant pool?
    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
        return CFP->getValueAPF().isZero();
  }
  return false;
}

/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode.  Return
/// true if Op is undef or if it matches the specified value.
static bool isConstantOrUndef(int Op, int Val) {
  return Op < 0 || Op == Val;
}

/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
        return false;
  } else if (ShuffleKind == 1) {
    unsigned j = IsLE ? 0 : 1;
    for (unsigned i = 0; i != 8; ++i)
      if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
        return false;
  }
  return true;
}

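// For illustration (derived from the checks above, not in the original
// source): on a big-endian target with two distinct inputs (ShuffleKind 0),
// the accepted VPKUHUM mask is <1,3,5,...,29,31>, i.e. the odd (low-order)
// byte of each halfword of the concatenated inputs.
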
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUWUM instruction.
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
        return false;
  } else if (ShuffleKind == 1) {
    unsigned j = IsLE ? 0 : 2;
    for (unsigned i = 0; i != 8; i += 2)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
        return false;
  }
  return true;
}

/// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
/// current subtarget.
///
/// The ShuffleKind distinguishes between big-endian operations with
/// two different inputs (0), either-endian operations with two identical
/// inputs (1), and little-endian operations with two different inputs (2).
/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG) {
  const PPCSubtarget& Subtarget =
      static_cast<const PPCSubtarget&>(DAG.getSubtarget());
  if (!Subtarget.hasP8Vector())
    return false;

  bool IsLE = DAG.getDataLayout().isLittleEndian();
  if (ShuffleKind == 0) {
    if (IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||
          !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||
          !isConstantOrUndef(N->getMaskElt(i+3), i*2+7))
        return false;
  } else if (ShuffleKind == 2) {
    if (!IsLE)
      return false;
    for (unsigned i = 0; i != 16; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||
          !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||
          !isConstantOrUndef(N->getMaskElt(i+3), i*2+3))
        return false;
  } else if (ShuffleKind == 1) {
    unsigned j = IsLE ? 0 : 4;
    for (unsigned i = 0; i != 8; i += 4)
      if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||
          !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||
          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
          !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||
          !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
          !isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
        return false;
  }
  return true;
}

/// isVMerge - Common function, used to match vmrg* shuffles.
///
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
                     unsigned LHSStart, unsigned RHSStart) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;
  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
         "Unsupported merge size!");

  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
                             LHSStart+j+i*UnitSize) ||
          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
                             RHSStart+j+i*UnitSize))
        return false;
    }
  return true;
}

/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  }
}

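// Example (illustrative, not in the original source): with UnitSize = 1 on a
// big-endian target and two distinct inputs (ShuffleKind 0), the accepted
// VMRGLB mask is <8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31>.
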
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
/// The ShuffleKind distinguishes between big-endian merges with two
/// different inputs (0), either-endian merges with two identical inputs (1),
/// and little-endian merges with two different inputs (2). For the latter,
/// the input operands are swapped (see PPCInstrAltivec.td).
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
                             unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 8, 8);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, UnitSize, 8, 24);
    else
      return false;
  } else {
    if (ShuffleKind == 1) // unary
      return isVMerge(N, UnitSize, 0, 0);
    else if (ShuffleKind == 0) // normal
      return isVMerge(N, UnitSize, 0, 16);
    else
      return false;
  }
}

/**
 * Common function used to match vmrgew and vmrgow shuffles
 *
 * The indexOffset determines whether to look for even or odd words in
 * the shuffle mask. This is based on the endianness of the target
 * machine.
 *   - Little Endian:
 *     - Use offset of 0 to check for odd elements
 *     - Use offset of 4 to check for even elements
 *   - Big Endian:
 *     - Use offset of 0 to check for even elements
 *     - Use offset of 4 to check for odd elements
 * A detailed description of the vector element ordering for little endian and
 * big endian can be found at
 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
 * Targeting your applications - what little endian and big endian IBM XL C/C++
 * compiler differences mean to you
 *
 * The mask to the shuffle vector instruction specifies the indices of the
 * elements from the two input vectors to place in the result. The elements are
 * numbered in array-access order, starting with the first vector. These
 * vectors are always of type v16i8, thus each vector will contain 16
 * byte-sized elements. More info on the shuffle vector can be found in the
 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
 * Language Reference.
 *
 * The RHSStartValue indicates whether the same input vectors are used (unary)
 * or two different input vectors are used, based on the following:
 *   - If the instruction uses the same vector for both inputs, the range of the
 *     indices will be 0 to 15. In this case, the RHSStart value passed should
 *     be 0.
 *   - If the instruction has two different vectors then the range of the
 *     indices will be 0 to 31. In this case, the RHSStart value passed should
 *     be 16 (indices 0-15 specify elements in the first vector while indices 16
 *     to 31 specify elements in the second vector).
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] IndexOffset Specifies whether to look for even or odd elements
 * \param[in] RHSStartValue Specifies the starting index for the righthand input
 *   vector to the shuffle_vector instruction
 * \return true iff this shuffle vector represents an even or odd word merge
 */
static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
                     unsigned RHSStartValue) {
  if (N->getValueType(0) != MVT::v16i8)
    return false;

  for (unsigned i = 0; i < 2; ++i)
    for (unsigned j = 0; j < 4; ++j)
      if (!isConstantOrUndef(N->getMaskElt(i*4+j),
                             i*RHSStartValue+j+IndexOffset) ||
          !isConstantOrUndef(N->getMaskElt(i*4+j+8),
                             i*RHSStartValue+j+IndexOffset+8))
        return false;
  return true;
}

/**
 * Determine if the specified shuffle mask is suitable for the vmrgew or
 * vmrgow instructions.
 *
 * \param[in] N The shuffle vector SD Node to analyze
 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
 * \param[in] ShuffleKind Identify the type of merge:
 *   - 0 = big-endian merge with two different inputs;
 *   - 1 = either-endian merge with two identical inputs;
 *   - 2 = little-endian merge with two different inputs (inputs are swapped for
 *     little-endian merges).
 * \param[in] DAG The current SelectionDAG
 * \return true iff this shuffle mask represents an even or odd word merge
 */
bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
                              unsigned ShuffleKind, SelectionDAG &DAG) {
  if (DAG.getDataLayout().isLittleEndian()) {
    unsigned indexOffset = CheckEven ? 4 : 0;
    if (ShuffleKind == 1) // Unary
      return isVMerge(N, indexOffset, 0);
    else if (ShuffleKind == 2) // swapped
      return isVMerge(N, indexOffset, 16);
    else
      return false;
  }
  else {
    unsigned indexOffset = CheckEven ? 0 : 4;
    if (ShuffleKind == 1) // Unary
      return isVMerge(N, indexOffset, 0);
    else if (ShuffleKind == 0) // Normal
      return isVMerge(N, indexOffset, 16);
    else
      return false;
  }
  return false;
}

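// Example (illustrative, not in the original source): on a big-endian target
// with two distinct inputs, the even-word merge (vmrgew) mask is
// <0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27>.
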
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
/// amount, otherwise return -1.
/// The ShuffleKind distinguishes between big-endian operations with two
/// different inputs (0), either-endian operations with two identical inputs
/// (1), and little-endian operations with two different inputs (2).  For the
/// latter, the input operands are swapped (see PPCInstrAltivec.td).
int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
                             SelectionDAG &DAG) {
  if (N->getValueType(0) != MVT::v16i8)
    return -1;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

  // Find the first non-undef value in the shuffle mask.
  unsigned i;
  for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
    /*search*/;

  if (i == 16) return -1;  // all undef.

  // Otherwise, check to see if the rest of the elements are consecutively
  // numbered from this value.
  unsigned ShiftAmt = SVOp->getMaskElt(i);
  if (ShiftAmt < i) return -1;

  ShiftAmt -= i;
  bool isLE = DAG.getDataLayout().isLittleEndian();

  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != 16; ++i)
      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
        return -1;
  } else if (ShuffleKind == 1) {
    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != 16; ++i)
      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
        return -1;
  } else
    return -1;

  if (isLE)
    ShiftAmt = 16 - ShiftAmt;

  return ShiftAmt;
}

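// Worked example (illustrative, not in the original source): the big-endian
// two-input mask <3,4,5,...,18> has its first non-undef element 3 at
// position 0 and consecutive elements after it, so this returns a vsldoi
// shift amount of 3.
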
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of a single element that is suitable for input to
/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
  assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
         EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");

  // The consecutive indices need to specify an element, not part of two
  // different elements.  So abandon ship early if this isn't the case.
  if (N->getMaskElt(0) % EltSize != 0)
    return false;

  // This is a splat operation if each element of the permute is the same, and
  // if the value doesn't reference the second vector.
  unsigned ElementBase = N->getMaskElt(0);

  // FIXME: Handle UNDEF elements too!
  if (ElementBase >= 16)
    return false;

  // Check that the indices are consecutive, in the case of a multi-byte element
  // splatted with a v16i8 mask.
  for (unsigned i = 1; i != EltSize; ++i)
    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
      return false;

  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
    if (N->getMaskElt(i) < 0) continue;
    for (unsigned j = 0; j != EltSize; ++j)
      if (N->getMaskElt(i+j) != N->getMaskElt(j))
        return false;
  }
  return true;
}

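// Example (illustrative, not in the original source): with EltSize = 4, the
// mask <8,9,10,11, 8,9,10,11, 8,9,10,11, 8,9,10,11> is accepted as a splat
// of word element 2.
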
/// Check that the mask is shuffling N byte elements. Within each N byte
/// element of the mask, the indices could be either in increasing or
/// decreasing order as long as they are consecutive.
/// \param[in] N the shuffle vector SD Node to analyze
/// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
/// Word/DoubleWord/QuadWord).
/// \param[in] StepLen the delta between consecutive indices within an N byte
/// element: 1 if the mask is in increasing order, -1 if it is decreasing.
/// \return true iff the mask is shuffling N byte elements.
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
                                   int StepLen) {
  assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
         "Unexpected element width.");
  assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");

  unsigned NumOfElem = 16 / Width;
  unsigned MaskVal[16]; //  Width is never greater than 16
  for (unsigned i = 0; i < NumOfElem; ++i) {
    MaskVal[0] = N->getMaskElt(i * Width);
    if ((StepLen == 1) && (MaskVal[0] % Width)) {
      return false;
    } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
      return false;
    }

    for (unsigned int j = 1; j < Width; ++j) {
      MaskVal[j] = N->getMaskElt(i * Width + j);
      if (MaskVal[j] != MaskVal[j-1] + StepLen) {
        return false;
      }
    }
  }

  return true;
}

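// Example (illustrative, not in the original source): Width = 4 with
// StepLen = -1 matches the byte-reversed word mask
// <3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12>, the shape the XXBRW check
// below looks for.
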
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                          unsigned &InsertAtByte, bool &Swap, bool IsLE) {
  if (!isNByteElemShuffleMask(N, 4, 1))
    return false;

  // Now we look at mask elements 0,4,8,12
  unsigned M0 = N->getMaskElt(0) / 4;
  unsigned M1 = N->getMaskElt(4) / 4;
  unsigned M2 = N->getMaskElt(8) / 4;
  unsigned M3 = N->getMaskElt(12) / 4;
  unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
  unsigned BigEndianShifts[] = { 3, 0, 1, 2 };

  // Below, let H and L be arbitrary elements of the shuffle mask
  // where H is in the range [4,7] and L is in the range [0,3].
  // H, 1, 2, 3 or L, 5, 6, 7
  if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
      (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
    InsertAtByte = IsLE ? 12 : 0;
    Swap = M0 < 4;
    return true;
  }
  // 0, H, 2, 3 or 4, L, 6, 7
  if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
      (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
    InsertAtByte = IsLE ? 8 : 4;
    Swap = M1 < 4;
    return true;
  }
  // 0, 1, H, 3 or 4, 5, L, 7
  if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
      (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
    ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
    InsertAtByte = IsLE ? 4 : 8;
    Swap = M2 < 4;
    return true;
  }
  // 0, 1, 2, H or 4, 5, 6, L
  if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
      (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
    ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
    InsertAtByte = IsLE ? 0 : 12;
    Swap = M3 < 4;
    return true;
  }

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    ShiftElts = 0;
    Swap = true;
    unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
    if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
      InsertAtByte = IsLE ? 12 : 0;
      return true;
    }
    if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
      InsertAtByte = IsLE ? 8 : 4;
      return true;
    }
    if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
      InsertAtByte = IsLE ? 4 : 8;
      return true;
    }
    if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
      InsertAtByte = IsLE ? 0 : 12;
      return true;
    }
  }

  return false;
}

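// Worked example (illustrative, not in the original source): on LE, the word
// mask <4,1,2,3> (M0 = 4) matches the "H, 1, 2, 3" case above, giving
// ShiftElts = LittleEndianShifts[0] = 2, InsertAtByte = 12 and Swap = false.
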
bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                               bool &Swap, bool IsLE) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
  // Ensure each byte index of the word is consecutive.
  if (!isNByteElemShuffleMask(N, 4, 1))
    return false;

  // Now we look at mask elements 0,4,8,12, which are the beginning of words.
  unsigned M0 = N->getMaskElt(0) / 4;
  unsigned M1 = N->getMaskElt(4) / 4;
  unsigned M2 = N->getMaskElt(8) / 4;
  unsigned M3 = N->getMaskElt(12) / 4;

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    assert(M0 < 4 && "Indexing into an undef vector?");
    if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
      return false;

    ShiftElts = IsLE ? (4 - M0) % 4 : M0;
    Swap = false;
    return true;
  }

  // Ensure each word index of the ShuffleVector Mask is consecutive.
  if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
    return false;

  if (IsLE) {
    if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
      // Input vectors don't need to be swapped if the leading element
      // of the result is one of the 3 left elements of the second vector
      // (or if there is no shift to be done at all).
      Swap = false;
      ShiftElts = (8 - M0) % 8;
    } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
      // Input vectors need to be swapped if the leading element
      // of the result is one of the 3 left elements of the first vector
      // (or if we're shifting by 4 - thereby simply swapping the vectors).
      Swap = true;
      ShiftElts = (4 - M0) % 4;
    }

    return true;
  } else { // BE
    if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
      // Input vectors don't need to be swapped if the leading element
      // of the result is one of the 4 elements of the first vector.
      Swap = false;
      ShiftElts = M0;
    } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
      // Input vectors need to be swapped if the leading element
      // of the result is one of the 4 elements of the right vector.
      Swap = true;
      ShiftElts = M0 - 4;
    }

    return true;
  }
}

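// Worked example (illustrative, not in the original source): on LE, word
// elements M0..M3 = <6,7,0,1> take the first branch above, so Swap = false
// and ShiftElts = (8 - 6) % 8 = 2.
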
static bool isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");

  if (!isNByteElemShuffleMask(N, Width, -1))
    return false;

  for (int i = 0; i < 16; i += Width)
    if (N->getMaskElt(i) != i + Width - 1)
      return false;

  return true;
}

bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 2);
}

bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 4);
}

bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 8);
}

bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
  return isXXBRShuffleMaskHelper(N, 16);
}

/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
/// if the inputs to the instruction should be swapped and set \p DM to the
/// value for the immediate.
/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
/// AND element 0 of the result comes from the first input (LE) or second input
/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
/// mask.
bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
                                bool &Swap, bool IsLE) {
  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");

  // Ensure each byte index of the double word is consecutive.
  if (!isNByteElemShuffleMask(N, 8, 1))
    return false;

  unsigned M0 = N->getMaskElt(0) / 8;
  unsigned M1 = N->getMaskElt(8) / 8;
  assert(((M0 | M1) < 4) && "A mask element out of bounds?");

  // If both vector operands for the shuffle are the same vector, the mask will
  // contain only elements from the first one and the second one will be undef.
  if (N->getOperand(1).isUndef()) {
    if ((M0 | M1) < 2) {
      DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
      Swap = false;
      return true;
    } else
      return false;
  }

  if (IsLE) {
    if (M0 > 1 && M1 < 2) {
      Swap = false;
    } else if (M0 < 2 && M1 > 1) {
      M0 = (M0 + 2) % 4;
      M1 = (M1 + 2) % 4;
      Swap = true;
    } else
      return false;

    // Note: if control flow comes here, Swap was already set above.
    DM = (((~M1) & 1) << 1) + ((~M0) & 1);
    return true;
  } else { // BE
    if (M0 < 2 && M1 > 1) {
      Swap = false;
    } else if (M0 > 1 && M1 < 2) {
      M0 = (M0 + 2) % 4;
      M1 = (M1 + 2) % 4;
      Swap = true;
    } else
      return false;

    // Note: if control flow comes here, Swap was already set above.
    DM = (M0 << 1) + (M1 & 1);
    return true;
  }
}

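// Worked example (illustrative, not in the original source): on LE,
// doubleword elements M0 = 3 and M1 = 1 satisfy (M0 > 1 && M1 < 2), so
// Swap = false and DM = (((~1) & 1) << 1) + ((~3) & 1) = 0.
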
/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
/// appropriate for PPC mnemonics (which have a big endian bias - namely
/// elements are counted from the left of the vector register).
unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
                                         SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  assert(isSplatShuffleMask(SVOp, EltSize));
  if (DAG.getDataLayout().isLittleEndian())
    return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
  else
    return SVOp->getMaskElt(0) / EltSize;
}

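// Example (illustrative, not in the original source): with EltSize = 4 and a
// little-endian splat whose first mask element is 12, the returned index is
// (16/4) - 1 - (12/4) = 0, matching the left-to-right element numbering of
// the mnemonics.
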
/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
/// by using a vspltis[bhw] instruction of the specified element size, return
/// the constant being splatted.  The ByteSize field indicates the number of
/// bytes of each element [124] -> [bhw].
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
  SDValue OpVal(nullptr, 0);

  // If ByteSize of the splat is bigger than the element size of the
  // build_vector, then we have a case where we are checking for a splat where
  // multiple elements of the buildvector are folded together into a single
  // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
  unsigned EltSize = 16/N->getNumOperands();
  if (EltSize < ByteSize) {
    unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
    SDValue UniquedVals[4];
    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");

    // See if all of the elements in the buildvector agree across.
    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
      if (N->getOperand(i).isUndef()) continue;
      // If the element isn't a constant, bail fully out.
      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();

      if (!UniquedVals[i&(Multiple-1)].getNode())
        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
        return SDValue();  // no match.
    }

    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
    // either constant or undef values that are identical for each chunk.  See
    // if these chunks can form into a larger vspltis*.

    // Check to see if all of the leading entries are either 0 or -1.  If
    // neither, then this won't fit into the immediate field.
    bool LeadingZero = true;
    bool LeadingOnes = true;
    for (unsigned i = 0; i != Multiple-1; ++i) {
      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.

      LeadingZero &= isNullConstant(UniquedVals[i]);
      LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
    }
    // Finally, check the least significant entry.
    if (LeadingZero) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(0, SDLoc(N), MVT::i32);  // 0,0,0,undef
      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
      if (Val < 16) // 0,0,0,4 -> vspltisw(4)
        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
    }
    if (LeadingOnes) {
      if (!UniquedVals[Multiple-1].getNode())
        return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
      if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
    }

    return SDValue();
  }

  // Check to see if this buildvec has a single non-undef value in its elements.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    if (N->getOperand(i).isUndef()) continue;
    if (!OpVal.getNode())
      OpVal = N->getOperand(i);
    else if (OpVal != N->getOperand(i))
      return SDValue();
  }

  if (!OpVal.getNode()) return SDValue();  // All UNDEF: use implicit def.

  unsigned ValSizeInBytes = EltSize;
  uint64_t Value = 0;
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
    Value = CN->getZExtValue();
  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
    Value = FloatToBits(CN->getValueAPF().convertToFloat());
  }

  // If the splat value is larger than the element value, then we can never do
  // this splat.  The only case that we could fit the replicated bits into our
  // immediate field for would be zero, and we prefer to use vxor for it.
  if (ValSizeInBytes < ByteSize) return SDValue();

  // If the element value is larger than the splat value, check if it consists
  // of a repeated bit pattern of size ByteSize.
  if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
    return SDValue();

  // Properly sign extend the value.
  int MaskVal = SignExtend32(Value, ByteSize * 8);

  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
  if (MaskVal == 0) return SDValue();

  // Finally, if this value fits in a 5 bit sext field, return it
  if (SignExtend32<5>(MaskVal) == MaskVal)
    return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);
  return SDValue();
}

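// Example (illustrative, not in the original source): a v8i16 build_vector
// splatting the constant 0xFFFE sign-extends here to MaskVal = -2, which
// fits the 5-bit signed immediate field, so the caller can materialize it
// with vspltish -2.
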
/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift
/// amount, otherwise return -1.
int PPC::isQVALIGNIShuffleMask(SDNode *N) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1)
    return -1;

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);

  // Find the first non-undef value in the shuffle mask.
  unsigned i;
  for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i)
    /*search*/;

  if (i == 4) return -1;  // all undef.

  // Otherwise, check to see if the rest of the elements are consecutively
  // numbered from this value.
  unsigned ShiftAmt = SVOp->getMaskElt(i);
  if (ShiftAmt < i) return -1;
  ShiftAmt -= i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != 4; ++i)
    if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
      return -1;

  return ShiftAmt;
}

//===----------------------------------------------------------------------===//
// Addressing Mode Selection
//===----------------------------------------------------------------------===//

/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value.  If so, this returns true and the
/// immediate.
bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
  if (!isa<ConstantSDNode>(N))
    return false;

  Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
  if (N->getValueType(0) == MVT::i32)
    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
  else
    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}
bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
  return isIntS16Immediate(Op.getNode(), Imm);
}

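// Example (illustrative, not in the original source): an i32 constant
// 0xFFFF8000 sign-extends from the 16-bit value -32768, so Imm becomes
// -32768 and the function returns true; for 0x00018000 the low 16 bits do
// not sign-extend back to the full value, so it returns false.
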
/// SelectAddressEVXRegReg - Given the specified address, check to see if it can
/// be represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
                                               SDValue &Index,
                                               SelectionDAG &DAG) const {
  for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
       UI != E; ++UI) {
    if (MemSDNode *Memop = dyn_cast<MemSDNode>(*UI)) {
      if (Memop->getMemoryVT() == MVT::f64) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }
  return false;
}

/// SelectAddressRegReg - Given the specified address, check to see if it
/// can be represented as an indexed [r+r] operation.  Returns false if it
/// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
/// non-zero and N can be represented by a base register plus a signed 16-bit
/// displacement, make a more precise judgement by checking (displacement % \p
/// EncodingAlignment).
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
                                            SDValue &Index, SelectionDAG &DAG,
                                            unsigned EncodingAlignment) const {
  int16_t imm = 0;
  if (N.getOpcode() == ISD::ADD) {
    // SPE f64 load/store cannot handle a 16-bit offset; it can only handle
    // 8-bit offsets, so prefer the [r+r] form for it.
    if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
      return true;
    if (isIntS16Immediate(N.getOperand(1), imm) &&
        (!EncodingAlignment || !(imm % EncodingAlignment)))
      return false; // r+i
    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
      return false; // r+i

    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  } else if (N.getOpcode() == ISD::OR) {
    if (isIntS16Immediate(N.getOperand(1), imm) &&
        (!EncodingAlignment || !(imm % EncodingAlignment)))
      return false; // fold as r+i if we can

    // If this is an or of disjoint bitfields, we can codegen this as an add
    // (for better address arithmetic) if the LHS and RHS of the OR are provably
    // disjoint.
    KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));

    if (LHSKnown.Zero.getBoolValue()) {
      KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
      // If all of the bits are known zero on the LHS or RHS, the add won't
      // carry.
      if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }

  return false;
}

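// Example (illustrative, not in the original source): for (or %hi, %lo)
// where %hi has its low bits known zero and %lo has the complementary high
// bits known zero, the OR cannot carry, so it behaves as an add and is
// selected as the [r+r] pair (%hi, %lo).
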
// If we happen to be doing an i64 load or store into a stack slot that has
// less than a 4-byte alignment, then the frame-index elimination may need to
// use an indexed load or store instruction (because the offset may not be a
// multiple of 4). The extra register needed to hold the offset comes from the
// register scavenger, and it is possible that the scavenger will need to use
// an emergency spill slot. As a result, we need to make sure that a spill slot
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
// stack slot.
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
  // FIXME: This does not handle the LWA case.
  if (VT != MVT::i64)
    return;

  // NOTE: We'll exclude negative FIs here, which come from argument
  // lowering, because there are no known test cases triggering this problem
  // using packed structures (or similar). We can remove this exclusion if
  // we find such a test case. The reason why this is so test-case driven is
  // because this entire 'fixup' is only to prevent crashes (from the
  // register scavenger) on not-really-valid inputs. For example, if we have:
  //   %a = alloca i1
  //   %b = bitcast i1* %a to i64*
  //   store i64 0, i64* %b
  // then the store should really be marked as 'align 1', but is not. If it
  // were marked as 'align 1' then the indexed form would have been
  // instruction-selected initially, and the problem this 'fixup' is preventing
  // won't happen regardless.
  if (FrameIdx < 0)
    return;

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  unsigned Align = MFI.getObjectAlignment(FrameIdx);
  if (Align >= 4)
    return;

  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setHasNonRISpills();
}

/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg.  If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
                                            SDValue &Base,
                                            SelectionDAG &DAG,
                                            unsigned EncodingAlignment) const {
  // FIXME dl should come from parent load or store, not from address
  SDLoc dl(N);
  // If this can be more profitably realized as r+r, fail.
  if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
    return false;

  if (N.getOpcode() == ISD::ADD) {
    int16_t imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm) &&
        (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
      Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
        fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
      } else {
        Base = N.getOperand(0);
      }
      return true; // [r+i]
    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
      // Match LOAD (ADD (X, Lo(G))).
      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
             && "Cannot handle constant offsets yet!");
      Disp = N.getOperand(1).getOperand(0);  // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      return true; // [&g+r]
    }
  } else if (N.getOpcode() == ISD::OR) {
    int16_t imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm) &&
        (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
      // If this is an or of disjoint bitfields, we can codegen this as an add
      // (for better address arithmetic) if the LHS and RHS of the OR are
      // provably disjoint.
      KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));

      if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
        if (FrameIndexSDNode *FI =
              dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
          fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
        } else {
          Base = N.getOperand(0);
        }
        Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
        return true;
      }
    }
  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    // Loading from a constant address.

    // If this address fits entirely in a 16-bit sext immediate field, codegen
    // this as "d, 0"
    int16_t Imm;
    if (isIntS16Immediate(CN, Imm) &&
        (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) {
      Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
      Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                             CN->getValueType(0));
      return true;
    }

    // Handle 32-bit sext immediates with LIS + addr mode.
    if ((CN->getValueType(0) == MVT::i32 ||
         (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
        (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) {
      int Addr = (int)CN->getZExtValue();

      // Otherwise, break this down into an LIS + disp.
      Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);

      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
                                   MVT::i32);
      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
      Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
      return true;
    }
  }

  Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
  } else
    Base = N;
  return true; // [r+0]
}

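// Worked example (illustrative, not in the original source): the constant
// address 0x12348000 does not fit a signed 16-bit field, so it is split as
// Disp = (short)0x8000 = -32768 and Base = LIS 0x1235, since
// 0x12350000 + (-32768) == 0x12348000.
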
/// SelectAddressRegRegOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
                                                SDValue &Index,
                                                SelectionDAG &DAG) const {
  // Check to see if we can easily represent this as an [r+r] address.  This
  // will fail if it thinks that the address is more profitably represented as
  // reg+imm, e.g. where imm = 0.
  if (SelectAddressRegReg(N, Base, Index, DAG))
    return true;

  // If the address is the result of an add, we will utilize the fact that the
  // address calculation includes an implicit add.  However, we can reduce
  // register pressure if we do not materialize a constant just for use as the
  // index register.  We only get rid of the add if it is not an add of a
  // value and a 16-bit signed constant and both have a single use.
  int16_t imm = 0;
  if (N.getOpcode() == ISD::ADD &&
      (!isIntS16Immediate(N.getOperand(1), imm) ||
       !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  }

  // Otherwise, do it the hard way, using R0 as the base register.
  Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                         N.getValueType());
  Index = N;
  return true;
}

/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {

  // If there are any other uses other than scalar to vector, then we should
  // keep it as a scalar load -> direct move pattern to prevent multiple
  // loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
  if (!LD)
    return false;

  EVT MemVT = LD->getMemoryVT();
  if (!MemVT.isSimple())
    return false;
  switch(MemVT.getSimpleVT().SimpleTy) {
  case MVT::i64:
    break;
  case MVT::i32:
    if (!ST.hasP8Vector())
      return false;
    break;
  case MVT::i16:
  case MVT::i8:
    if (!ST.hasP9Vector())
      return false;
    break;
  default:
    return false;
  }

  SDValue LoadedVal(N, 0);
  if (!LoadedVal.hasOneUse())
    return false;

  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
       UI != UE; ++UI)
    if (UI.getUse().get().getResNo() == 0 &&
        UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
      return false;

  return true;
}

/// getPreIndexedAddressParts - returns true, and sets the base pointer, the
/// offset pointer, and the addressing mode by reference, if the node's address
/// can be legally represented as a pre-indexed load / store address.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                  SDValue &Offset,
                                                  ISD::MemIndexedMode &AM,
                                                  SelectionDAG &DAG) const {
  if (DisablePPCPreinc) return false;

  bool isLoad = true;
  SDValue Ptr;
  EVT VT;
  unsigned Alignment;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Alignment = LD->getAlignment();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Alignment = ST->getAlignment();
    isLoad = false;
  } else
    return false;

  // Do not generate pre-inc forms for specific loads that feed scalar_to_vector
  // instructions because we can fold these into a more efficient instruction
  // instead (such as LXSD).
  if (isLoad && usePartialVectorLoads(N, Subtarget)) {
    return false;
  }

  // PowerPC doesn't have preinc load/store instructions for vectors (except
  // for QPX, which does have preinc r+r forms).
  if (VT.isVector()) {
    if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) {
      return false;
    } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) {
      AM = ISD::PRE_INC;
      return true;
    }
  }

  if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
    // Common code will reject creating a pre-inc form if the base pointer
    // is a frame index, or if N is a store and the base pointer is either
    // the same as or a predecessor of the value being stored.  Check for
    // those situations here, and try with swapped Base/Offset instead.
    bool Swap = false;

    if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
      Swap = true;
    else if (!isLoad) {
      SDValue Val = cast<StoreSDNode>(N)->getValue();
      if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
        Swap = true;
    }

    if (Swap)
      std::swap(Base, Offset);

    AM = ISD::PRE_INC;
    return true;
  }

  // LDU/STU can only handle immediates that are a multiple of 4.
  if (VT != MVT::i64) {
    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
      return false;
  } else {
    // LDU/STU need an address with at least 4-byte alignment.
    if (Alignment < 4)
      return false;

    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
      return false;
  }

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
    // sext i32 to i64 when addr mode is r+i.
    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
        LD->getExtensionType() == ISD::SEXTLOAD &&
        isa<ConstantSDNode>(Offset))
      return false;
  }

  AM = ISD::PRE_INC;
  return true;
}

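// Example (illustrative, not in the original source): a loop loading i32
// values from p, p+4, p+8, ... can be selected through this hook to use
// lwzu, the pre-incrementing load that updates its base register in place.
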
//===----------------------------------------------------------------------===//
// LowerOperation implementation
//===----------------------------------------------------------------------===//

/// Return true if we should reference labels using a PICBase, set the HiOpFlags
/// and LoOpFlags to the target MO flags.
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
                               unsigned &HiOpFlags, unsigned &LoOpFlags,
                               const GlobalValue *GV = nullptr) {
  HiOpFlags = PPCII::MO_HA;
  LoOpFlags = PPCII::MO_LO;

  // Don't use the pic base if not in PIC relocation model.
  if (IsPIC) {
    HiOpFlags |= PPCII::MO_PIC_FLAG;
    LoOpFlags |= PPCII::MO_PIC_FLAG;
  }

  // If this is a reference to a global value that requires a non-lazy-ptr, make
  // sure that instruction lowering adds it.
  if (GV && Subtarget.hasLazyResolverStub(GV)) {
    HiOpFlags |= PPCII::MO_NLP_FLAG;
    LoOpFlags |= PPCII::MO_NLP_FLAG;

    if (GV->hasHiddenVisibility()) {
      HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
      LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
    }
  }
}

static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
                             SelectionDAG &DAG) {
  SDLoc DL(HiPart);
  EVT PtrVT = HiPart.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, PtrVT);

  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);

  // With PIC, the first instruction is actually "GR+hi(&G)".
  if (isPIC)
    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);

  // Generate non-pic code that has direct accesses to the constant pool.
  // The address of the global is just (hi(&g)+lo(&g)).
  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}

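// Example (illustrative, not in the original source): for a global @g on ELF
// this lowers to the usual pair
//   addis rD, rBase, g@ha
//   addi  rD, rD, g@l
// i.e. the (hi(&g) + lo(&g)) sum built above.
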
static void setUsesTOCBasePtr(MachineFunction &MF) {
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setUsesTOCBasePtr();
}

static void setUsesTOCBasePtr(SelectionDAG &DAG) {
  setUsesTOCBasePtr(DAG.getMachineFunction());
}

SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
                                       SDValue GA) const {
  const bool Is64Bit = Subtarget.isPPC64();
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)
                        : Subtarget.isAIXABI()
                              ? DAG.getRegister(PPC::R2, VT)
                              : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
  SDValue Ops[] = { GA, Reg };
  return DAG.getMemIntrinsicNode(
      PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
      MachineMemOperand::MOLoad);
}

SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  const Constant *C = CP->getConstVal();

  // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
    return getTOCEntry(DAG, SDLoc(CP), GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
                                           PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, SDLoc(CP), GA);
  }

  SDValue CPIHi =
    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
  SDValue CPILo =
    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
  return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}

// For 64-bit PowerPC, prefer the more compact relative encodings.
|
|
// This trades 32 bits per jump table entry for one or two instructions
|
|
// on the jump site.
|
|
unsigned PPCTargetLowering::getJumpTableEncoding() const {
|
|
if (isJumpTableRelative())
|
|
return MachineJumpTableInfo::EK_LabelDifference32;
|
|
|
|
return TargetLowering::getJumpTableEncoding();
|
|
}
|
|
|
|
bool PPCTargetLowering::isJumpTableRelative() const {
|
|
if (UseAbsoluteJumpTables)
|
|
return false;
|
|
if (Subtarget.isPPC64() || Subtarget.isAIXABI())
|
|
return true;
|
|
return TargetLowering::isJumpTableRelative();
|
|
}
|
|
|
|
SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
|
|
SelectionDAG &DAG) const {
|
|
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
|
|
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
|
|
|
|
switch (getTargetMachine().getCodeModel()) {
|
|
case CodeModel::Small:
|
|
case CodeModel::Medium:
|
|
return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
|
|
default:
|
|
return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
|
|
getPointerTy(DAG.getDataLayout()));
|
|
}
|
|
}
|
|
|
|
const MCExpr *
|
|
PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
|
|
unsigned JTI,
|
|
MCContext &Ctx) const {
|
|
if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
|
|
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
|
|
|
|
switch (getTargetMachine().getCodeModel()) {
|
|
case CodeModel::Small:
|
|
case CodeModel::Medium:
|
|
return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
|
|
default:
|
|
return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
|
|
}
|
|
}
|
|
|
|
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
|
|
EVT PtrVT = Op.getValueType();
|
|
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
|
|
|
|
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
|
|
// The actual address of the GlobalValue is stored in the TOC.
|
|
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
|
|
setUsesTOCBasePtr(DAG);
|
|
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
|
|
return getTOCEntry(DAG, SDLoc(JT), GA);
|
|
}
|
|
|
|
unsigned MOHiFlag, MOLoFlag;
|
|
bool IsPIC = isPositionIndependent();
|
|
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
|
|
|
|
if (IsPIC && Subtarget.isSVR4ABI()) {
|
|
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
|
|
PPCII::MO_PIC_FLAG);
|
|
return getTOCEntry(DAG, SDLoc(GA), GA);
|
|
}
|
|
|
|
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
|
|
SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
|
|
return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
|
|
}
|
|
|
|
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
|
|
SelectionDAG &DAG) const {
|
|
EVT PtrVT = Op.getValueType();
|
|
BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
|
|
const BlockAddress *BA = BASDN->getBlockAddress();
|
|
|
|
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
|
|
// The actual BlockAddress is stored in the TOC.
|
|
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
|
|
setUsesTOCBasePtr(DAG);
|
|
SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
|
|
return getTOCEntry(DAG, SDLoc(BASDN), GA);
|
|
}
|
|
|
|
// 32-bit position-independent ELF stores the BlockAddress in the .got.
|
|
if (Subtarget.is32BitELFABI() && isPositionIndependent())
|
|
return getTOCEntry(
|
|
DAG, SDLoc(BASDN),
|
|
DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()));
|
|
|
|
unsigned MOHiFlag, MOLoFlag;
|
|
bool IsPIC = isPositionIndependent();
|
|
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
|
|
SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
|
|
SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
|
|
return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
|
|
}
|
|
|
|
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
|
|
SelectionDAG &DAG) const {
|
|
// FIXME: TLS addresses currently use medium model code sequences,
|
|
// which is the most useful form. Eventually support for small and
|
|
// large models could be added if users need it, at the cost of
|
|
// additional complexity.
|
|
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
|
|
if (DAG.getTarget().useEmulatedTLS())
|
|
return LowerToTLSEmulatedModel(GA, DAG);
|
|
|
|
SDLoc dl(GA);
|
|
const GlobalValue *GV = GA->getGlobal();
|
|
EVT PtrVT = getPointerTy(DAG.getDataLayout());
|
|
bool is64bit = Subtarget.isPPC64();
|
|
const Module *M = DAG.getMachineFunction().getFunction().getParent();
|
|
PICLevel::Level picLevel = M->getPICLevel();
|
|
|
|
const TargetMachine &TM = getTargetMachine();
|
|
TLSModel::Model Model = TM.getTLSModel(GV);
|
|
|
|
if (Model == TLSModel::LocalExec) {
|
|
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
|
|
PPCII::MO_TPREL_HA);
|
|
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
|
|
PPCII::MO_TPREL_LO);
|
|
SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
|
|
: DAG.getRegister(PPC::R2, MVT::i32);
|
|
|
|
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
|
|
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
|
|
}
|
|
|
|
if (Model == TLSModel::InitialExec) {
|
|
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
|
|
SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
|
|
PPCII::MO_TLS);
|
|
SDValue GOTPtr;
|
|
if (is64bit) {
|
|
setUsesTOCBasePtr(DAG);
|
|
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
|
|
GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
|
|
PtrVT, GOTReg, TGA);
|
|
} else {
|
|
if (!TM.isPositionIndependent())
|
|
GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
|
|
else if (picLevel == PICLevel::SmallPIC)
|
|
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
|
|
else
|
|
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
|
|
}
|
|
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
|
|
PtrVT, TGA, GOTPtr);
|
|
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
|
|
}
|
|
|
|
if (Model == TLSModel::GeneralDynamic) {
|
|
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
|
|
SDValue GOTPtr;
|
|
if (is64bit) {
|
|
setUsesTOCBasePtr(DAG);
|
|
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
|
|
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
|
|
GOTReg, TGA);
|
|
} else {
|
|
if (picLevel == PICLevel::SmallPIC)
|
|
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
|
|
else
|
|
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
|
|
}
|
|
return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
|
|
GOTPtr, TGA, TGA);
|
|
}
|
|
|
|
if (Model == TLSModel::LocalDynamic) {
|
|
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
|
|
SDValue GOTPtr;
|
|
if (is64bit) {
|
|
setUsesTOCBasePtr(DAG);
|
|
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
|
|
GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
|
|
GOTReg, TGA);
|
|
} else {
|
|
if (picLevel == PICLevel::SmallPIC)
|
|
GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
|
|
else
|
|
GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
|
|
}
|
|
SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
|
|
PtrVT, GOTPtr, TGA, TGA);
|
|
SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
|
|
PtrVT, TLSAddr, TGA);
|
|
return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
|
|
}
|
|
|
|
llvm_unreachable("Unknown TLS model!");
|
|
}
|
|
|
|
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
|
|
SelectionDAG &DAG) const {
|
|
EVT PtrVT = Op.getValueType();
|
|
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
|
|
SDLoc DL(GSDN);
|
|
const GlobalValue *GV = GSDN->getGlobal();
|
|
|
|
// 64-bit SVR4 ABI & AIX ABI code is always position-independent.
|
|
// The actual address of the GlobalValue is stored in the TOC.
|
|
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
|
|
setUsesTOCBasePtr(DAG);
|
|
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
|
|
return getTOCEntry(DAG, DL, GA);
|
|
}
|
|
|
|
unsigned MOHiFlag, MOLoFlag;
|
|
bool IsPIC = isPositionIndependent();
|
|
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);
|
|
|
|
if (IsPIC && Subtarget.isSVR4ABI()) {
|
|
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
|
|
GSDN->getOffset(),
|
|
PPCII::MO_PIC_FLAG);
|
|
return getTOCEntry(DAG, DL, GA);
|
|
}
|
|
|
|
SDValue GAHi =
|
|
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
|
|
SDValue GALo =
|
|
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
|
|
|
|
SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
|
|
|
|
// If the global reference is actually to a non-lazy-pointer, we have to do an
|
|
// extra load to get the address of the global.
|
|
if (MOHiFlag & PPCII::MO_NLP_FLAG)
|
|
Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
|
|
return Ptr;
|
|
}
|
|
|
|
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
|
|
SDLoc dl(Op);
|
|
|
|
if (Op.getValueType() == MVT::v2i64) {
|
|
// When the operands themselves are v2i64 values, we need to do something
|
|
// special because VSX has no underlying comparison operations for these.
|
|
if (Op.getOperand(0).getValueType() == MVT::v2i64) {
|
|
// Equality can be handled by casting to the legal type for Altivec
|
|
// comparisons, everything else needs to be expanded.
|
|
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
|
|
return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
|
|
DAG.getSetCC(dl, MVT::v4i32,
|
|
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
|
|
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
|
|
CC));
|
|
}
|
|
|
|
return SDValue();
|
|
}
|
|
|
|
// We handle most of these in the usual way.
|
|
return Op;
|
|
}
|
|
|
|
// If we're comparing for equality to zero, expose the fact that this is
|
|
// implemented as a ctlz/srl pair on ppc, so that the dag combiner can
|
|
// fold the new nodes.
|
|
if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
|
|
return V;
|
|
|
|
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
|
|
// Leave comparisons against 0 and -1 alone for now, since they're usually
|
|
// optimized. FIXME: revisit this when we can custom lower all setcc
|
|
// optimizations.
|
|
if (C->isAllOnesValue() || C->isNullValue())
|
|
return SDValue();
|
|
}
|
|
|
|
// If we have an integer seteq/setne, turn it into a compare against zero
|
|
// by xor'ing the rhs with the lhs, which is faster than setting a
|
|
// condition register, reading it back out, and masking the correct bit. The
|
|
// normal approach here uses sub to do this instead of xor. Using xor exposes
|
|
// the result to other bit-twiddling opportunities.
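  // For example, (seteq a, b) becomes (seteq (xor a, b), 0); the compare
  // against zero can then be folded into the ctlz/srl pattern produced by
  // lowerCmpEqZeroToCtlzSrl above on a later combiner pass.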
  EVT LHSVT = Op.getOperand(0).getValueType();
  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    EVT VT = Op.getValueType();
    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
                              Op.getOperand(1));
    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
  }
  return SDValue();
}

SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  EVT VT = Node->getValueType(0);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue InChain = Node->getOperand(0);
  SDValue VAListPtr = Node->getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  SDLoc dl(Node);

  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");

  // gpr_index
  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    VAListPtr, MachinePointerInfo(SV), MVT::i8);
  InChain = GprIndex.getValue(1);

  if (VT == MVT::i64) {
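    // Under the 32-bit SVR4 ABI an i64 vararg occupies an aligned GPR pair
    // (r3:r4, r5:r6, ...), so it must start at an even gpr index; round the
    // index up here if it is odd.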
    // Check if GprIndex is even
    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
                                 DAG.getConstant(1, dl, MVT::i32));
    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
                                DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
                                          DAG.getConstant(1, dl, MVT::i32));
    // Align GprIndex to be even if it isn't
    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
                           GprIndex);
  }

  // fpr index is 1 byte after gpr
  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                               DAG.getConstant(1, dl, MVT::i32));

  // fpr
  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    FprPtr, MachinePointerInfo(SV), MVT::i8);
  InChain = FprIndex.getValue(1);

  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                       DAG.getConstant(8, dl, MVT::i32));

  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                        DAG.getConstant(4, dl, MVT::i32));

  // areas
  SDValue OverflowArea =
      DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
  InChain = OverflowArea.getValue(1);

  SDValue RegSaveArea =
      DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
  InChain = RegSaveArea.getValue(1);

  // CC is true while the index is still below 8, i.e. while the value lives
  // in the register save area; otherwise we select overflow_area below.
  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
                            DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);

  // adjustment constant gpr_index * 4/8
  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
                                    VT.isInteger() ? GprIndex : FprIndex,
                                    DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
                                                    MVT::i32));

  // OurReg = RegSaveArea + RegConstant
  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
                               RegConstant);

  // Floating types are 32 bytes into RegSaveArea
  if (VT.isFloatingPoint())
    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
                         DAG.getConstant(32, dl, MVT::i32));
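  // (The register save area holds the 8 GPRs in its first 8 * 4 == 32
  // bytes, followed by the 8 FPRs, hence the fixed 32-byte offset for
  // floating-point values.)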

  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
                                   VT.isInteger() ? GprIndex : FprIndex,
                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
                                                   MVT::i32));

  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
                              VT.isInteger() ? VAListPtr : FprPtr,
                              MachinePointerInfo(SV), MVT::i8);

  // determine if we should load from reg_save_area or overflow_area
  SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);

  // increase overflow_area by 4/8 if the gpr/fpr index is already >= 8
  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
                                                          dl, MVT::i32));

  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
                             OverflowAreaPlusN);

  InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
                              MachinePointerInfo(), MVT::i32);

  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 bytes of alignment padding + 2*sizeof(char*) = 12 bytes
  return DAG.getMemcpy(Op.getOperand(0), Op,
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
                       false, MachinePointerInfo(), MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");

  return Op.getOperand(0);
}

SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");

  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Ty = IntPtrTy;
  Entry.Node = Trmp; Args.push_back(Entry);

  // TrampSize == (isPPC64 ? 48 : 40);
  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
                               isPPC64 ? MVT::i64 : MVT::i32);
  Args.push_back(Entry);

  Entry.Node = FPtr; Args.push_back(Entry);
  Entry.Node = Nest; Args.push_back(Entry);

  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
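  // __trampoline_setup is a runtime support routine (provided by e.g.
  // compiler-rt) that writes the trampoline's instruction sequence into the
  // buffer and flushes the instruction cache for it.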
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.second;
}

SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  SDLoc dl(Op);

  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We suppose the given va_list is already allocated.
  //
  // typedef struct {
  //  char gpr;     /* index into the array of 8 GPRs
  //                 * stored in the register save area
  //                 * gpr=0 corresponds to r3,
  //                 * gpr=1 to r4, etc.
  //                 */
  //  char fpr;     /* index into the array of 8 FPRs
  //                 * stored in the register save area
  //                 * fpr=0 corresponds to f1,
  //                 * fpr=1 to f2, etc.
  //                 */
  //  char *overflow_arg_area;
  //                /* location on stack that holds
  //                 * the next overflow argument
  //                 */
  //  char *reg_save_area;
  //                /* where r3:r10 and f1:f8 (if saved)
  //                 * are stored
  //                 */
  // } va_list[1];
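  //
  // With 4-byte pointers this gives the field offsets used below:
  // gpr at 0, fpr at 1, overflow_arg_area at 4, reg_save_area at 8.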

  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
                                            PtrVT);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                 PtrVT);

  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);

  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
  SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);

  uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Store first byte : number of int regs
  SDValue firstStore =
      DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
                        MachinePointerInfo(SV), MVT::i8);
  uint64_t nextOffset = FPROffset;
  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
                                ConstFPROffset);

  // Store second byte : number of float regs
  SDValue secondStore =
      DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
                        MachinePointerInfo(SV, nextOffset), MVT::i8);
  nextOffset += StackOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
                                    MachinePointerInfo(SV, nextOffset));
  nextOffset += FrameOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(thirdStore, dl, FR, nextPtr,
                      MachinePointerInfo(SV, nextOffset));
}

/// FPR - The set of FP registers that should be allocated for arguments
/// on Darwin and AIX.
static const MCPhysReg FPR[] = {PPC::F1,  PPC::F2,  PPC::F3, PPC::F4, PPC::F5,
                                PPC::F6,  PPC::F7,  PPC::F8, PPC::F9, PPC::F10,
                                PPC::F11, PPC::F12, PPC::F13};

/// QFPR - The set of QPX registers that should be allocated for arguments.
static const MCPhysReg QFPR[] = {
    PPC::QF1, PPC::QF2, PPC::QF3,  PPC::QF4,  PPC::QF5,  PPC::QF6, PPC::QF7,
    PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13};

/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
                                       unsigned PtrByteSize) {
  unsigned ArgSize = ArgVT.getStoreSize();
  if (Flags.isByVal())
    ArgSize = Flags.getByValSize();

  // Round up to multiples of the pointer size, except for array members,
  // which are always packed.
  if (!Flags.isInConsecutiveRegs())
    ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
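  // e.g. with PtrByteSize == 8, a 13-byte byval argument reserves 16 bytes.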

  return ArgSize;
}

/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
                                            ISD::ArgFlagsTy Flags,
                                            unsigned PtrByteSize) {
  unsigned Align = PtrByteSize;

  // Altivec parameters are padded to a 16 byte boundary.
  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
      ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
    Align = 16;
  // QPX vector types stored in double-precision are padded to a 32 byte
  // boundary.
  else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
    Align = 32;

  // ByVal parameters are aligned as requested.
  if (Flags.isByVal()) {
    unsigned BVAlign = Flags.getByValAlign();
    if (BVAlign > PtrByteSize) {
      if (BVAlign % PtrByteSize != 0)
        llvm_unreachable(
            "ByVal alignment is not a multiple of the pointer size");

      Align = BVAlign;
    }
  }

  // Array members are always packed to their original alignment.
  if (Flags.isInConsecutiveRegs()) {
    // If the array member was split into multiple registers, the first
    // needs to be aligned to the size of the full type.  (Except for
    // ppcf128, which is only aligned as its f64 components.)
    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
      Align = OrigVT.getStoreSize();
    else
      Align = ArgVT.getStoreSize();
  }

  return Align;
}

/// CalculateStackSlotUsed - Return whether this argument will use its
/// stack slot (instead of being passed in registers).  ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
                                   ISD::ArgFlagsTy Flags,
                                   unsigned PtrByteSize,
                                   unsigned LinkageSize,
                                   unsigned ParamAreaSize,
                                   unsigned &ArgOffset,
                                   unsigned &AvailableFPRs,
                                   unsigned &AvailableVRs, bool HasQPX) {
  bool UseMemory = false;

  // Respect alignment of argument on the stack.
  unsigned Align =
      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
  // If there's no space left in the argument save area, we must
  // use memory (this check also catches zero-sized arguments).
  if (ArgOffset >= LinkageSize + ParamAreaSize)
    UseMemory = true;

  // Allocate argument on the stack.
  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  if (Flags.isInConsecutiveRegsLast())
    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
  // If we overran the argument save area, we must use memory
  // (this check catches arguments passed partially in memory)
  if (ArgOffset > LinkageSize + ParamAreaSize)
    UseMemory = true;

  // However, if the argument is actually passed in an FPR or a VR,
  // we don't use memory after all.
  if (!Flags.isByVal()) {
    if (ArgVT == MVT::f32 || ArgVT == MVT::f64 ||
        // QPX registers overlap with the scalar FP registers.
        (HasQPX && (ArgVT == MVT::v4f32 ||
                    ArgVT == MVT::v4f64 ||
                    ArgVT == MVT::v4i1)))
      if (AvailableFPRs > 0) {
        --AvailableFPRs;
        return false;
      }
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
        ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
      if (AvailableVRs > 0) {
        --AvailableVRs;
        return false;
      }
  }

  return UseMemory;
}

/// EnsureStackAlignment - Round stack frame size up from NumBytes to
/// ensure minimum alignment required for target.
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                     unsigned NumBytes) {
  unsigned TargetAlign = Lowering->getStackAlignment();
  unsigned AlignMask = TargetAlign - 1;
  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
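  // This rounds NumBytes up to the next multiple of TargetAlign (a power of
  // two); e.g. with a 16-byte stack alignment, 100 becomes 112.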
  return NumBytes;
}

SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isAIXABI())
    return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                    InVals);
  if (Subtarget.is64BitELFABI())
    return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                       InVals);
  if (Subtarget.is32BitELFABI())
    return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                       InVals);

  return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                     InVals);
}

SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                    *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
  if (useSoftFloat())
    CCInfo.PreAnalyzeFormalArguments(Ins);

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
  CCInfo.clearWasPPCF128();

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      const TargetRegisterClass *RC;
      EVT ValVT = VA.getValVT();

      switch (ValVT.getSimpleVT().SimpleTy) {
      default:
        llvm_unreachable("ValVT not supported by formal arguments Lowering");
      case MVT::i1:
      case MVT::i32:
        RC = &PPC::GPRCRegClass;
        break;
      case MVT::f32:
        if (Subtarget.hasP8Vector())
          RC = &PPC::VSSRCRegClass;
        else if (Subtarget.hasSPE())
          RC = &PPC::GPRCRegClass;
        else
          RC = &PPC::F4RCRegClass;
        break;
      case MVT::f64:
        if (Subtarget.hasVSX())
          RC = &PPC::VSFRCRegClass;
        else if (Subtarget.hasSPE())
          // SPE passes doubles in GPR pairs.
          RC = &PPC::GPRCRegClass;
        else
          RC = &PPC::F8RCRegClass;
        break;
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
        RC = &PPC::VRRCRegClass;
        break;
      case MVT::v4f32:
        RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass;
        break;
      case MVT::v2f64:
      case MVT::v2i64:
        RC = &PPC::VRRCRegClass;
        break;
      case MVT::v4f64:
        RC = &PPC::QFRCRegClass;
        break;
      case MVT::v4i1:
        RC = &PPC::QBRCRegClass;
        break;
      }

      SDValue ArgValue;
      // Transform the arguments stored in physical registers into
      // virtual ones.
      if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
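        // An SPE f64 was assigned to two consecutive 32-bit register
        // locations.  On big-endian targets the first register carries the
        // most-significant word, so the halves are swapped below before
        // BUILD_SPE64 reassembles the double.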
        assert(i + 1 < e && "No second half of double precision argument");
        unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC);
        unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
        SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
        SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
        if (!Subtarget.isLittleEndian())
          std::swap (ArgValueLo, ArgValueHi);
        ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
                               ArgValueHi);
      } else {
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
                                      ValVT == MVT::i1 ? MVT::i32 : ValVT);
        if (ValVT == MVT::i1)
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
      }

      InVals.push_back(ArgValue);
    } else {
      // Argument stored in memory.
      assert(VA.isMemLoc());

      // Get the extended size of the argument type in stack
      unsigned ArgSize = VA.getLocVT().getStoreSize();
      // Get the actual size of the argument type
      unsigned ObjSize = VA.getValVT().getStoreSize();
      unsigned ArgOffset = VA.getLocMemOffset();
      // Stack objects in PPC32 are right justified.
      ArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(
          DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
    }
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                      ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
  MinReservedArea = std::max(MinReservedArea, LinkageSize);

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    static const MCPhysReg GPArgRegs[] = {
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);

    static const MCPhysReg FPArgRegs[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);

    if (useSoftFloat() || hasSPE())
      NumFPArgRegs = 0;

    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));

    // Make room for NumGPArgRegs and NumFPArgRegs.
    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;

    FuncInfo->setVarArgsStackOffset(
        MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
                              CCInfo.getNextStackOffset(), true));

    FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }

    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
    // is set.
    // The double arguments are stored to the VarArgsFrameIndex
    // on the stack.
    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store
      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
                                       PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
                                             EVT ObjectVT, SelectionDAG &DAG,
                                             SDValue ArgVal,
                                             const SDLoc &dl) const {
  if (Flags.isSExt())
    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));
  else if (Flags.isZExt())
    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));

  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}

SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  // TODO: add description of PPC stack frame format, or at least some docs.
  //
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
         "fastcc not supported on varargs functions");

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 8;
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned Num_GPR_Regs = array_lengthof(GPR);
  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
  const unsigned Num_VR_Regs = array_lengthof(VR);
  const unsigned Num_QFPR_Regs = Num_FPR_Regs;

  // Do a first pass over the arguments to determine whether the ABI
  // guarantees that our caller has allocated the parameter save area
  // on its stack frame. In the ELFv1 ABI, this is always the case;
  // in the ELFv2 ABI, it is true if this is a vararg function or if
  // any parameter is located in a stack slot.

  bool HasParameterArea = !isELFv2ABI || isVarArg;
  unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
  unsigned NumBytes = LinkageSize;
  unsigned AvailableFPRs = Num_FPR_Regs;
  unsigned AvailableVRs = Num_VR_Regs;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (Ins[i].Flags.isNest())
      continue;

    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
                               PtrByteSize, LinkageSize, ParamAreaSize,
                               NumBytes, AvailableFPRs, AvailableVRs,
                               Subtarget.hasQPX()))
      HasParameterArea = true;
  }

  // Add DAG nodes to load the arguments or copy them out of registers.  On
  // entry to a function on PPC, the arguments start after the linkage area,
  // although the first ones are often in registers.

  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  unsigned &QFPR_idx = FPR_idx;
  SmallVector<SDValue, 8> MemOps;
  Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    SDValue ArgVal;
    bool needsLoad = false;
    EVT ObjectVT = Ins[ArgNo].VT;
    EVT OrigVT = Ins[ArgNo].ArgVT;
    unsigned ObjSize = ObjectVT.getStoreSize();
    unsigned ArgSize = ObjSize;
    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
    if (Ins[ArgNo].isOrigArg()) {
      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
    }
    // We re-align the argument offset for each argument, except when using the
    // fast calling convention, when we need to make sure we do that only when
    // we'll actually use a stack slot.
    unsigned CurArgOffset, Align;
    auto ComputeArgOffset = [&]() {
      /* Respect alignment of argument on the stack.  */
      Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
      CurArgOffset = ArgOffset;
    };

    if (CallConv != CallingConv::Fast) {
      ComputeArgOffset();

      /* Compute GPR index associated with argument offset.  */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
    }

    // FIXME the codegen can be much improved in some cases.
    // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

      if (CallConv == CallingConv::Fast)
        ComputeArgOffset();

      // ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple
      // of registers.
      ObjSize = Flags.getByValSize();
      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      // Empty aggregate parameters do not take up registers.  Examples:
      //   struct { } a;
      //   union  { } b;
      //   int c[0];
      // etc.  However, we have to provide a place-holder in InVals, so
      // pretend we have an 8-byte item at the current address for that
      // purpose.
      if (!ObjSize) {
        int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        InVals.push_back(FIN);
        continue;
      }

      // Create a stack object covering all stack doublewords occupied
      // by the argument.  If the argument is (fully or partially) on
      // the stack, or if the argument is fully in registers but the
      // caller has allocated the parameter save anyway, we can refer
      // directly to the caller's stack frame.  Otherwise, create a
      // local copy in our own frame.
      int FI;
      if (HasParameterArea ||
          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
        FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
      else
        FI = MFI.CreateStackObject(ArgSize, Align, false);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);

      // Handle aggregates smaller than 8 bytes.
      if (ObjSize < PtrByteSize) {
        // The value of the object is its address, which differs from the
        // address of the enclosing doubleword on big-endian systems.
        SDValue Arg = FIN;
        if (!isLittleEndian) {
          SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
          Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
        }
        InVals.push_back(Arg);

        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
          FuncInfo->addLiveInAttr(VReg, Flags);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          SDValue Store;

          if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
            EVT ObjType = (ObjSize == 1 ? MVT::i8 :
                           (ObjSize == 2 ? MVT::i16 : MVT::i32));
            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
                                      MachinePointerInfo(&*FuncArg), ObjType);
          } else {
            // For sizes that don't fit a truncating store (3, 5, 6, 7),
            // store the whole register as-is to the parameter save area
            // slot.
            Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                 MachinePointerInfo(&*FuncArg));
          }

          MemOps.push_back(Store);
        }
        // Whether we copied from a register or not, advance the offset
        // into the parameter save area by a full doubleword.
        ArgOffset += PtrByteSize;
        continue;
      }

      // The value of the object is its address, which is the address of
      // its first stack doubleword.
      InVals.push_back(FIN);

      // Store whatever pieces of the object are in registers to memory.
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        if (GPR_idx == Num_GPR_Regs)
          break;

        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
        SDValue Addr = FIN;
        if (j) {
          SDValue Off = DAG.getConstant(j, dl, PtrVT);
          Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
        }
        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
                                     MachinePointerInfo(&*FuncArg, j));
        MemOps.push_back(Store);
        ++GPR_idx;
      }
      ArgOffset += ArgSize;
      continue;
    }

    switch (ObjectVT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unhandled argument type!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (Flags.isNest()) {
        // The 'nest' parameter, if any, is passed in R11.
        unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

        break;
      }

      // These can be scalar arguments or elements of an integer array type
      // passed directly.  Clang may use those instead of "byval" aggregate
      // types to avoid forcing arguments to memory unnecessarily.
      if (GPR_idx != Num_GPR_Regs) {
        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
          // value to MVT::i64 and then truncate to the correct register size.
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
        ArgSize = PtrByteSize;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += 8;
      break;

    case MVT::f32:
    case MVT::f64:
      // These can be scalar arguments or elements of a float array type
      // passed directly.  The latter are used to implement ELFv2 homogenous
      // float aggregates.
      if (FPR_idx != Num_FPR_Regs) {
        unsigned VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(FPR[FPR_idx],
                              Subtarget.hasP8Vector()
                                  ? &PPC::VSSRCRegClass
                                  : &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
                                                ? &PPC::VSFRCRegClass
                                                : &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++FPR_idx;
      } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
        // once we support fp <-> gpr moves.

        // This can only ever happen in the presence of f32 array types,
        // since otherwise we never run out of FPRs before running out
        // of GPRs.
        unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::f32) {
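          // The f32 occupies one 4-byte word of the 8-byte GPR.  When the
          // argument offset places it in the register's most-significant
          // word, shift it down before truncating to i32.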
          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
                                 DAG.getConstant(32, dl, MVT::i32));
          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
        }

        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
      }

      // When passing an array of floats, the array occupies consecutive
      // space in the argument area; only round up to the next doubleword
      // at the end of the array.  Otherwise, each float takes 8 bytes.
      if (CallConv != CallingConv::Fast || needsLoad) {
        ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
        ArgOffset += ArgSize;
        if (Flags.isInConsecutiveRegsLast())
          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      }
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
    case MVT::v1i128:
    case MVT::f128:
      if (!Subtarget.hasQPX()) {
        // These can be scalar arguments or elements of a vector array type
        // passed directly.  The latter are used to implement ELFv2 homogenous
        // vector aggregates.
        if (VR_idx != Num_VR_Regs) {
          unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
          ++VR_idx;
        } else {
          if (CallConv == CallingConv::Fast)
            ComputeArgOffset();
          needsLoad = true;
        }
        if (CallConv != CallingConv::Fast || needsLoad)
          ArgOffset += 16;
        break;
      } // not QPX

      assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
             "Invalid QPX parameter type");
      LLVM_FALLTHROUGH;

    case MVT::v4f64:
    case MVT::v4i1:
      // QPX vectors are treated like their scalar floating-point subregisters
      // (except that they're larger).
      unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 16 : 32;
      if (QFPR_idx != Num_QFPR_Regs) {
        const TargetRegisterClass *RC;
        switch (ObjectVT.getSimpleVT().SimpleTy) {
        case MVT::v4f64: RC = &PPC::QFRCRegClass; break;
        case MVT::v4f32: RC = &PPC::QSRCRegClass; break;
        default:         RC = &PPC::QBRCRegClass; break;
        }

        unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++QFPR_idx;
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();
        needsLoad = true;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += Sz;
      break;
    }

    // We need to load the argument to a virtual register if we determined
    // above that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      if (ObjSize < ArgSize && !isLittleEndian)
        CurArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
    }

    InVals.push_back(ArgVal);
  }

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea;
  if (HasParameterArea)
    MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
  else
    MinReservedArea = LinkageSize;

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized functions' reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    int Depth = ArgOffset;

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateFixedObject(PtrByteSize, Depth, true));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by dereferencing
    // the result of va_next.
    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

SDValue PPCTargetLowering::LowerFormalArguments_Darwin(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  // TODO: add description of PPC stack frame format, or at least some docs.
  //
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = isPPC64 ? 8 : 4;
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned ArgOffset = LinkageSize;
  // Area that is at least reserved in caller of this function.
  unsigned MinReservedArea = ArgOffset;

  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
  const unsigned Num_VR_Regs  = array_lengthof(VR);

  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;

  // In 32-bit non-varargs functions, the stack space for vectors is after the
  // stack space for non-vectors.  We do not use this space unless we have
  // too many vectors to fit in registers, something that only occurs in
  // constructed examples:), but we have to walk the arglist to figure
  // that out...for the pathological case, compute VecArgOffset as the
  // start of the vector parameter area.  Computing VecArgOffset is the
  // entire point of the following loop.
  unsigned VecArgOffset = ArgOffset;
  if (!isVarArg && !isPPC64) {
    for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
         ++ArgNo) {
      EVT ObjectVT = Ins[ArgNo].VT;
      ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;

      if (Flags.isByVal()) {
        // ObjSize is the true size; ArgSize is ObjSize rounded up to a
        // multiple of registers.
        unsigned ObjSize = Flags.getByValSize();
        unsigned ArgSize =
            ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
        VecArgOffset += ArgSize;
        continue;
      }

      switch(ObjectVT.getSimpleVT().SimpleTy) {
      default: llvm_unreachable("Unhandled argument type!");
      case MVT::i1:
      case MVT::i32:
      case MVT::f32:
        VecArgOffset += 4;
        break;
      case MVT::i64:  // PPC64
      case MVT::f64:
        // FIXME: We are guaranteed to be !isPPC64 at this point.
        // Does MVT::i64 apply?
        VecArgOffset += 8;
        break;
      case MVT::v4f32:
      case MVT::v4i32:
      case MVT::v8i16:
      case MVT::v16i8:
        // Nothing to do, we're only looking at Nonvector args here.
        break;
      }
    }
  }
  // We've found where the vector parameter area in memory is.  Skip the
  // first 12 parameters; these don't use that memory.
  VecArgOffset = ((VecArgOffset+15)/16)*16;
  VecArgOffset += 12*16;
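  // (The 12 * 16 bytes skipped here appear to correspond to the vector
  // arguments that travel in registers v2-v13; only vectors beyond those
  // spill into this area.)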

  // Add DAG nodes to load the arguments or copy them out of registers.  On
  // entry to a function on PPC, the arguments start after the linkage area,
  // although the first ones are often in registers.

  SmallVector<SDValue, 8> MemOps;
  unsigned nAltivecParamsAtEnd = 0;
  Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    SDValue ArgVal;
    bool needsLoad = false;
    EVT ObjectVT = Ins[ArgNo].VT;
    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
    unsigned ArgSize = ObjSize;
    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
    if (Ins[ArgNo].isOrigArg()) {
      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
    }
    unsigned CurArgOffset = ArgOffset;

    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
      if (isVarArg || isPPC64) {
        MinReservedArea = ((MinReservedArea+15)/16)*16;
        MinReservedArea += CalculateStackSlotSize(ObjectVT,
                                                  Flags,
                                                  PtrByteSize);
      } else nAltivecParamsAtEnd++;
    } else
      // Calculate min reserved area.
      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
                                                Flags,
                                                PtrByteSize);

    // FIXME the codegen can be much improved in some cases.
    // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

      // ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple
      // of registers.
      ObjSize = Flags.getByValSize();
      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      // Objects of size 1 and 2 are right justified, everything else is
      // left justified.  This means the memory address is adjusted forwards.
      if (ObjSize==1 || ObjSize==2) {
        CurArgOffset = CurArgOffset + (4 - ObjSize);
      }
      // The value of the object is its address.
      int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(FIN);
      if (ObjSize==1 || ObjSize==2) {
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg;
          if (isPPC64)
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
          else
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
          SDValue Store =
              DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
                                MachinePointerInfo(&*FuncArg), ObjType);
          MemOps.push_back(Store);
          ++GPR_idx;
        }

        ArgOffset += PtrByteSize;

        continue;
      }
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        // Store whatever pieces of the object are in registers
        // to memory.  ArgOffset will be the address of the beginning
        // of the object.
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg;
          if (isPPC64)
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
          else
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                       MachinePointerInfo(&*FuncArg, j));
          MemOps.push_back(Store);
          ++GPR_idx;
          ArgOffset += PtrByteSize;
        } else {
          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
break;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
|
|
switch (ObjectVT.getSimpleVT().SimpleTy) {
|
|
default: llvm_unreachable("Unhandled argument type!");
|
|
case MVT::i1:
|
|
case MVT::i32:
|
|
if (!isPPC64) {
|
|
if (GPR_idx != Num_GPR_Regs) {
|
|
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
|
|
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
|
|
|
|
if (ObjectVT == MVT::i1)
|
|
ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);
|
|
|
|
++GPR_idx;
|
|
} else {
|
|
needsLoad = true;
|
|
ArgSize = PtrByteSize;
|
|
}
|
|
// All int arguments reserve stack space in the Darwin ABI.
|
|
ArgOffset += PtrByteSize;
|
|
break;
|
|
}
|
|
LLVM_FALLTHROUGH;
|
|
case MVT::i64: // PPC64
|
|
if (GPR_idx != Num_GPR_Regs) {
|
|
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
|
|
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
|
|
|
|
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
|
|
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
|
|
// value to MVT::i64 and then truncate to the correct register size.
|
|
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
|
|
|
|
++GPR_idx;
|
|
} else {
|
|
needsLoad = true;
|
|
ArgSize = PtrByteSize;
|
|
}
|
|
// All int arguments reserve stack space in the Darwin ABI.
|
|
ArgOffset += 8;
|
|
break;
|
|
|
|
case MVT::f32:
|
|
case MVT::f64:
|
|
// Every 4 bytes of argument space consumes one of the GPRs available for
|
|
// argument passing.
|
|
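      // In 32-bit mode an 8-byte f64 occupies two GPR-sized argument slots,
      // so it shadows a second GPR when one is still available.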
      if (GPR_idx != Num_GPR_Regs) {
        ++GPR_idx;
        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
          ++GPR_idx;
      }
      if (FPR_idx != Num_FPR_Regs) {
        unsigned VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++FPR_idx;
      } else {
        needsLoad = true;
      }

      // All FP arguments reserve stack space in the Darwin ABI.
      ArgOffset += isPPC64 ? 8 : ObjSize;
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      // Note that vector arguments in registers don't reserve stack space,
      // except in varargs functions.
      if (VR_idx != Num_VR_Regs) {
        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        if (isVarArg) {
          while ((ArgOffset % 16) != 0) {
            ArgOffset += PtrByteSize;
            if (GPR_idx != Num_GPR_Regs)
              GPR_idx++;
          }
          ArgOffset += 16;
          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
        }
        ++VR_idx;
      } else {
        if (!isVarArg && !isPPC64) {
          // Vectors go after all the nonvectors.
          CurArgOffset = VecArgOffset;
          VecArgOffset += 16;
        } else {
          // Vectors are aligned.
          ArgOffset = ((ArgOffset+15)/16)*16;
          CurArgOffset = ArgOffset;
          ArgOffset += 16;
        }
        needsLoad = true;
      }
      break;
    }

    // We need to load the argument to a virtual register if we determined
    // above that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      int FI = MFI.CreateFixedObject(ObjSize,
                                     CurArgOffset + (ArgSize - ObjSize),
                                     isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
    }

    InVals.push_back(ArgVal);
  }

  // Allow for Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    MinReservedArea = ((MinReservedArea+15)/16)*16;
    MinReservedArea += 16*nAltivecParamsAtEnd;
  }

  // Area that is at least reserved in the caller of this function.
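  // The caller must reserve the linkage area plus space for at least eight
  // pointer-sized argument slots, even if the function takes fewer arguments.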
  MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);

  // Set the size that is at least reserved in caller of this function.  Tail
  // call optimized functions' reserved stack space needs to be aligned so
  // that taking the difference between two stack areas will result in an
  // aligned stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  // If the function takes variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    int Depth = ArgOffset;

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
                              Depth, true));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by dereferencing
    // the result of va_next.
    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
      unsigned VReg;

      if (isPPC64)
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      else
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store.
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tail call.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
                                   unsigned ParamSize) {

  if (!isTailCall) return 0;

  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
  unsigned CallerMinReservedArea = FI->getMinReservedArea();
  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
  // Remember only the largest adjustment (the most negative SPDiff).
  if (SPDiff < FI->getTailCallSPDelta())
    FI->setTailCallSPDelta(SPDiff);

  return SPDiff;
}

static bool isFunctionGlobalAddress(SDValue Callee);

static bool
callsShareTOCBase(const Function *Caller, SDValue Callee,
                  const TargetMachine &TM) {
  // Callee is either a GlobalAddress or an ExternalSymbol.  ExternalSymbols
  // don't have enough information to determine if the caller and callee share
  // the same TOC base, so we have to pessimistically assume they don't for
  // correctness.
  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
  if (!G)
    return false;

  const GlobalValue *GV = G->getGlobal();
  // The medium and large code models are expected to provide a sufficiently
  // large TOC to provide all data addressing needs of a module with a
  // single TOC. Since each module will be addressed with a single TOC, we
  // only need to check that caller and callee don't cross dso boundaries.
  if (CodeModel::Medium == TM.getCodeModel() ||
      CodeModel::Large == TM.getCodeModel())
    return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV);

  // Otherwise we need to ensure callee and caller are in the same section,
  // since the linker may allocate multiple TOCs, and we don't know which
  // sections will belong to the same TOC base.

  if (!GV->isStrongDefinitionForLinker())
    return false;

  // Any explicitly-specified sections and section prefixes must also match.
  // Also, if we're using -ffunction-sections, then each function is always in
  // a different section (the same is true for COMDAT functions).
  if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
      GV->getSection() != Caller->getSection())
    return false;
  if (const auto *F = dyn_cast<Function>(GV)) {
    if (F->getSectionPrefix() != Caller->getSectionPrefix())
      return false;
  }

  // If the callee might be interposed, then we can't assume the ultimate call
  // target will be in the same section.  Even in cases where we can assume
  // that interposition won't happen, in any case where the linker might insert
  // a stub to allow for interposition, we must generate code as though
  // interposition might occur.  To understand why this matters, consider a
  // situation where: a -> b -> c where the arrows indicate calls.  b and c are
  // in the same section, but a is in a different module (i.e. has a different
  // TOC base pointer).  If the linker allows for interposition between b and
  // c, then it will generate a stub for the call edge between b and c which
  // will save the TOC pointer into the designated stack slot allocated by b.
  // If we return true here, and therefore allow a tail call between b and c,
  // that stack slot won't exist and the b -> c stub will end up saving b's
  // TOC base pointer into the stack slot allocated by a (where the a -> b
  // stub saved a's TOC base pointer).  If we're not considering a tail call,
  // but rather, whether a nop is needed after the call instruction in b,
  // because the linker will insert a stub, it might complain about a missing
  // nop if we omit it (although many don't complain in this case).
  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
    return false;

  return true;
}
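// Returns true if any of the outgoing arguments would have to be passed in a
// stack slot under the 64-bit ELF parameter passing rules, rather than
// entirely in registers.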
static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
                            const SmallVectorImpl<ISD::OutputArg> &Outs) {
  assert(Subtarget.is64BitELFABI());

  const unsigned PtrByteSize = 8;
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned NumGPRs = array_lengthof(GPR);
  const unsigned NumFPRs = 13;
  const unsigned NumVRs = array_lengthof(VR);
  const unsigned ParamAreaSize = NumGPRs * PtrByteSize;

  unsigned NumBytes = LinkageSize;
  unsigned AvailableFPRs = NumFPRs;
  unsigned AvailableVRs = NumVRs;

  for (const ISD::OutputArg& Param : Outs) {
    if (Param.Flags.isNest()) continue;

    if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags,
                               PtrByteSize, LinkageSize, ParamAreaSize,
                               NumBytes, AvailableFPRs, AvailableVRs,
                               Subtarget.hasQPX()))
      return true;
  }
  return false;
}
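// Returns true if the call site passes exactly the caller's own formal
// arguments through to the callee (allowing undef values of matching type),
// in which case the caller's incoming argument area can be reused in place.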
static bool
hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
  if (CS.arg_size() != CallerFn->arg_size())
    return false;

  ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
  ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
  Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();

  for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
    const Value* CalleeArg = *CalleeArgIter;
    const Value* CallerArg = &(*CallerArgIter);
    if (CalleeArg == CallerArg)
      continue;

    // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
    //        tail call @callee([4 x i64] undef, [4 x i64] %b)
    //      }
    // 1st argument of callee is undef and has the same type as caller.
    if (CalleeArg->getType() == CallerArg->getType() &&
        isa<UndefValue>(CalleeArg))
      continue;

    return false;
  }

  return true;
}

// Returns true if TCO is possible between the caller's and callee's
// calling conventions.
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
                                    CallingConv::ID CalleeCC) {
  // Tail calls are possible with fastcc and ccc.
  auto isTailCallableCC = [] (CallingConv::ID CC){
    return CC == CallingConv::C || CC == CallingConv::Fast;
  };
  if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
    return false;

  // We can safely tail call both fastcc and ccc callees from a c calling
  // convention caller. If the caller is fastcc, we may have less stack space
  // than a non-fastcc caller with the same signature so disable tail-calls in
  // that case.
  return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}

bool
PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
                                    SDValue Callee,
                                    CallingConv::ID CalleeCC,
                                    ImmutableCallSite CS,
                                    bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SelectionDAG& DAG) const {
  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;

  if (DisableSCO && !TailCallOpt) return false;

  // Variadic argument functions are not supported.
  if (isVarArg) return false;

  auto &Caller = DAG.getMachineFunction().getFunction();
  // Check that the calling conventions are compatible for tco.
  if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
    return false;

  // A caller with any byval parameter is not supported.
  if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
    return false;

  // A callee with any byval parameter is not supported either.
  // Note: This is a quick work around, because in some cases, e.g.
  // caller's stack size > callee's stack size, we are still able to apply
  // sibling call optimization. For example, gcc is able to do SCO for caller1
  // in the following example, but not for caller2.
  //   struct test {
  //     long int a;
  //     char ary[56];
  //   } gTest;
  //   __attribute__((noinline)) int callee(struct test v, struct test *b) {
  //     b->a = v.a;
  //     return 0;
  //   }
  //   void caller1(struct test a, struct test c, struct test *b) {
  //     callee(gTest, b); }
  //   void caller2(struct test *b) { callee(gTest, b); }
  if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
    return false;

  // If callee and caller use different calling conventions, we cannot pass
  // parameters on stack since offsets for the parameter area may be different.
  if (Caller.getCallingConv() != CalleeCC &&
      needStackSlotPassParameters(Subtarget, Outs))
    return false;

  // No TCO/SCO on indirect call because the caller has to restore its TOC.
  if (!isFunctionGlobalAddress(Callee) &&
      !isa<ExternalSymbolSDNode>(Callee))
    return false;

  // If the caller and callee potentially have different TOC bases then we
  // cannot tail call since we need to restore the TOC pointer after the call.
  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
  if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
    return false;

  // TCO allows altering callee ABI, so we don't have to check further.
  if (CalleeCC == CallingConv::Fast && TailCallOpt)
    return true;

  if (DisableSCO) return false;

  // If the callee uses the same argument list as the caller, then we can
  // apply SCO in this case. If it does not, then we need to check whether
  // the callee needs stack for passing arguments.
  if (!hasSameArgumentList(&Caller, CS) &&
      needStackSlotPassParameters(Subtarget, Outs)) {
    return false;
  }

  return true;
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
    return false;

  // Variable argument functions are not supported.
  if (isVarArg)
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
    // Functions containing by val parameters are not supported.
    for (unsigned i = 0; i != Ins.size(); i++) {
      ISD::ArgFlagsTy Flags = Ins[i].Flags;
      if (Flags.isByVal()) return false;
    }

    // Non-PIC/GOT tail calls are supported.
    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
      return true;

    // At the moment we can only do local tail calls (in same module, hidden
    // or protected) if we are generating PIC.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
      return G->getGlobal()->hasHiddenVisibility()
          || G->getGlobal()->hasProtectedVisibility();
  }

  return false;
}

/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  if (!C) return nullptr;

  int Addr = C->getZExtValue();
  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
      SignExtend32<26>(Addr) != Addr)
    return nullptr;  // Top 6 bits have to be sext of immediate.

  return DAG
      .getConstant(
          (int)C->getZExtValue() >> 2, SDLoc(Op),
          DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
      .getNode();
}

namespace {

struct TailCallArgumentInfo {
  SDValue Arg;
  SDValue FrameIdxOp;
  int FrameIdx = 0;

  TailCallArgumentInfo() = default;
};

} // end anonymous namespace

/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void StoreTailCallArgumentsToStackSlot(
    SelectionDAG &DAG, SDValue Chain,
    const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
    SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
    SDValue Arg = TailCallArgs[i].Arg;
    SDValue FIN = TailCallArgs[i].FrameIdxOp;
    int FI = TailCallArgs[i].FrameIdx;
    // Store relative to framepointer.
    MemOpChains.push_back(DAG.getStore(
        Chain, dl, Arg, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
  }
}

/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address
/// to the appropriate stack slot for the tail call optimized function call.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
                                             SDValue OldRetAddr, SDValue OldFP,
                                             int SPDiff, const SDLoc &dl) {
  if (SPDiff) {
    // Calculate the new stack slot for the return address.
    MachineFunction &MF = DAG.getMachineFunction();
    const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
    const PPCFrameLowering *FL = Subtarget.getFrameLowering();
    bool isPPC64 = Subtarget.isPPC64();
    int SlotSize = isPPC64 ? 8 : 4;
    int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
    int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                         NewRetAddrLoc, true);
    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
                         MachinePointerInfo::getFixedStack(MF, NewRetAddr));

    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
    // slot as the FP is never overwritten.
    if (Subtarget.isDarwinABI()) {
      int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
      int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc,
                                                         true);
      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
                           MachinePointerInfo::getFixedStack(
                               DAG.getMachineFunction(), NewFPIdx));
    }
  }
  return Chain;
}

/// CalculateTailCallArgDest - Remember the argument for later processing.
/// Calculate the position of the argument.
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
                         SDValue Arg, int SPDiff, unsigned ArgOffset,
                         SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
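  // The argument's destination is its usual parameter-area offset, shifted by
  // the stack pointer adjustment (SPDiff) computed for this tail call.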
  int Offset = ArgOffset + SPDiff;
  uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
  SDValue FIN = DAG.getFrameIndex(FI, VT);
  TailCallArgumentInfo Info;
  Info.Arg = Arg;
  Info.FrameIdxOp = FIN;
  Info.FrameIdx = FI;
  TailCallArguments.push_back(Info);
}

/// EmitTailCallLoadFPAndRetAddr - Emit loads from the frame pointer and
/// return address stack slots. Returns the chain as result and the loaded
/// frame pointers in LROpOut/FPOpOut. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
    SDValue &FPOpOut, const SDLoc &dl) const {
  if (SPDiff) {
    // Load the LR and FP stack slot for later adjusting.
    EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
    LROpOut = getReturnAddrFrameIndex(DAG);
    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
    Chain = SDValue(LROpOut.getNode(), 1);

    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
    // slot as the FP is never overwritten.
    if (Subtarget.isDarwinABI()) {
      FPOpOut = getFramePointerFrameIndex(DAG);
      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
      Chain = SDValue(FPOpOut.getNode(), 1);
    }
  }
  return Chain;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address
/// specified by "Src" to address "Dst" of size "Size".  Alignment information
/// is specified by the specific parameter attribute. The copy will be passed
/// as a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       false, false, false, MachinePointerInfo(),
                       MachinePointerInfo());
}

/// LowerMemOpCallTo - Store the argument to the stack or remember it in case
/// of tail calls.
static void LowerMemOpCallTo(
    SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
    SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
    bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
    SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  if (!isTailCall) {
    if (isVector) {
      SDValue StackPtr;
      if (isPPC64)
        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
      else
        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                           DAG.getConstant(ArgOffset, dl, PtrVT));
    }
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
    // Calculate and remember argument location.
  } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
                                  TailCallArguments);
}

static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
                SDValue FPOp,
                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
  // might overwrite each other in case of tail call optimization.
  SmallVector<SDValue, 8> MemOpChains2;
  // Do not flag preceding copytoreg stuff together with the following stuff.
  InFlag = SDValue();
  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                    MemOpChains2, dl);
  if (!MemOpChains2.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

  // Store the return address to the appropriate stack slot.
  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);

  // Emit callseq_end just before tailcall node.
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
  InFlag = Chain.getValue(1);
}

// Is this global address that of a function that can be called by name? (as
// opposed to something that must hold a descriptor for an indirect call).
static bool isFunctionGlobalAddress(SDValue Callee) {
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
        Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
      return false;

    return G->getGlobal()->getValueType()->isFunctionTy();
  }

  return false;
}

SDValue PPCTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext());

  CCRetInfo.AnalyzeCallResult(
      Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
               ? RetCC_PPC_Cold
               : RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val;
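    // With SPE, an f64 result comes back split across two i32 registers;
    // copy out both halves and reassemble them, swapping for big endian.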
    if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Lo.getValue(1);
      InFlag = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InFlag);
      Chain = Hi.getValue(1);
      InFlag = Hi.getValue(2);
      if (!Subtarget.isLittleEndian())
        std::swap(Lo, Hi);
      Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
    } else {
      Val = DAG.getCopyFromReg(Chain, dl,
                               VA.getLocReg(), VA.getLocVT(), InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}

static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
                           const PPCSubtarget &Subtarget, bool isPatchPoint) {
  // PatchPoint calls are not indirect.
  if (isPatchPoint)
    return false;

  if (isFunctionGlobalAddress(Callee) || dyn_cast<ExternalSymbolSDNode>(Callee))
    return false;

  // Darwin and 32-bit ELF can use a BLA. The descriptor based ABIs cannot
  // because the immediate function pointer points to a descriptor instead of
  // a function entry point. The ELFv2 ABI cannot use a BLA because the
  // function pointer immediate points to the global entry point, while the
  // BLA would need to jump to the local entry point (see rL211174).
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
      isBLACompatibleAddress(Callee, DAG))
    return false;

  return true;
}
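// Select the PPCISD opcode used to lower this call: TC_RETURN for tail
// calls, BCTRL (with or without a TOC reload) for indirect calls, and
// CALL or CALL_NOP for direct calls.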
static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
                              bool isTailCall, const Function &Caller,
                              const SDValue &Callee,
                              const PPCSubtarget &Subtarget,
                              const TargetMachine &TM) {
  if (isTailCall)
    return PPCISD::TC_RETURN;

  // This is a call through a function pointer.
  if (isIndirectCall) {
    // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
    // indirect calls. The save of the caller's TOC pointer to the stack will
    // be inserted into the DAG as part of call lowering. The restore of the
    // TOC pointer is modeled by using a pseudo instruction for the call
    // opcode that represents the 2 instruction sequence of an indirect branch
    // and link, immediately followed by a load of the TOC pointer from the
    // stack save slot into gpr2.
    if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
      return PPCISD::BCTRL_LOAD_TOC;

    // An indirect call that does not need a TOC restore.
    return PPCISD::BCTRL;
  }

  // The ABIs that maintain a TOC pointer across calls need to have a nop
  // immediately following the call instruction if the caller and callee may
  // have different TOC bases. At link time if the linker determines the calls
  // may not share a TOC base, the call is redirected to a trampoline inserted
  // by the linker. The trampoline will (among other things) save the caller's
  // TOC pointer at an ABI designated offset in the linkage area and the
  // linker will rewrite the nop to be a load of the TOC pointer from the
  // linkage area into gpr2.
  if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
    return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
                                                  : PPCISD::CALL_NOP;

  return PPCISD::CALL;
}
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
                               const SDLoc &dl, const PPCSubtarget &Subtarget) {
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
      return SDValue(Dest, 0);

  // Returns true if the callee is local, and false otherwise.
  auto isLocalCallee = [&]() {
    const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;

    return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
           !dyn_cast_or_null<GlobalIFunc>(GV);
  };

  // The PLT is only used in 32-bit ELF PIC mode.  Attempting to use the PLT in
  // a static relocation model causes some versions of GNU LD (2.17.50, at
  // least) to force BSS-PLT, instead of secure-PLT, even if all objects are
  // built with secure-PLT.
  bool UsePlt =
      Subtarget.is32BitELFABI() && !isLocalCallee() &&
      Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;

  if (isFunctionGlobalAddress(Callee)) {
    const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee);
    if (!Subtarget.isAIXABI())
      return DAG.getTargetGlobalAddress(G->getGlobal(), dl,
                                        Callee.getValueType(), 0,
                                        UsePlt ? PPCII::MO_PLT : 0);

    // On AIX, direct function calls reference the symbol for the function's
    // entry point, which is named by prepending a "." before the function's
    // C-linkage name.
    auto &Context = DAG.getMachineFunction().getMMI().getContext();

    const GlobalObject *GO = cast<GlobalObject>(G->getGlobal());
    MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
        Context.getOrCreateSymbol(Twine(".") + Twine(GO->getName())));

    if (GO && GO->isDeclaration() && !S->hasContainingCsect()) {
      // On AIX, an undefined symbol needs to be associated with a
      // MCSectionXCOFF to get the correct storage mapping class.
      // In this case, XCOFF::XMC_PR.
      const XCOFF::StorageClass SC =
          TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
      MCSectionXCOFF *Sec =
          Context.getXCOFFSection(S->getName(), XCOFF::XMC_PR, XCOFF::XTY_ER,
                                  SC, SectionKind::getMetadata());
      S->setContainingCsect(Sec);
    }

    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
    return DAG.getMCSymbol(S, PtrVT);
  }

  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
    return DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
                                       UsePlt ? PPCII::MO_PLT : 0);

  // No transformation needed.
  assert(Callee.getNode() && "What no callee?");
  return Callee;
}

static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
  assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
         "Expected a CALLSEQ_START node.");

  // The last operand is the chain, except when the node has glue. If the node
  // has glue, then the last operand is the glue, and the chain is the second
  // last operand.
  SDValue LastValue = CallSeqStart.getValue(CallSeqStart->getNumValues() - 1);
  if (LastValue.getValueType() != MVT::Glue)
    return LastValue;

  return CallSeqStart.getValue(CallSeqStart->getNumValues() - 2);
}

// Creates the node that moves a function's address into the count register
// to prepare for an indirect call instruction.
static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                SDValue &Glue, SDValue &Chain,
                                const SDLoc &dl) {
  SDValue MTCTROps[] = {Chain, Callee, Glue};
  EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
  Chain = DAG.getNode(PPCISD::MTCTR, dl, makeArrayRef(ReturnTypes, 2),
                      makeArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
  // The glue is the second value produced.
  Glue = Chain.getValue(1);
}

static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                          SDValue &Glue, SDValue &Chain,
                                          SDValue CallSeqStart,
                                          ImmutableCallSite CS, const SDLoc &dl,
                                          bool hasNest,
                                          const PPCSubtarget &Subtarget) {
  // Function pointers in the 64-bit SVR4 ABI do not point to the function
  // entry point, but to the function descriptor (the function entry point
  // address is part of the function descriptor though).
  // The function descriptor is a three doubleword structure with the
  // following fields: function entry point, TOC base address and
  // environment pointer.
  // Thus for a call through a function pointer, the following actions need
  // to be performed:
  //   1. Save the TOC of the caller in the TOC save area of its stack
  //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
  //   2. Load the address of the function entry point from the function
  //      descriptor.
  //   3. Load the TOC of the callee from the function descriptor into r2.
  //   4. Load the environment pointer from the function descriptor into
  //      r11.
  //   5. Branch to the function entry point address.
  //   6. On return of the callee, the TOC of the caller needs to be
  //      restored (this is done in FinishCall()).
  //
  // The loads are scheduled at the beginning of the call sequence, and the
  // register copies are flagged together to ensure that no other
  // operations can be scheduled in between. E.g. without flagging the
  // copies together, a TOC access in the caller could be scheduled between
  // the assignment of the callee TOC and the branch to the callee, which
  // leads to incorrect code.

  // Start by loading the function address from the descriptor.
  SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
  auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
                      ? (MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant)
                      : MachineMemOperand::MONone;

  MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);

  // Registers used in building the DAG.
  const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
  const MCRegister TOCReg = Subtarget.getTOCPointerRegister();

  // Offsets of descriptor members.
  const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
  const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();

  const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
  const unsigned Alignment = Subtarget.isPPC64() ? 8 : 4;

  // One load for the function's entry point address.
  SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,
                                    Alignment, MMOFlags);

  // One for loading the TOC anchor for the module that contains the called
  // function.
  SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl);
  SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff);
  SDValue TOCPtr =
      DAG.getLoad(RegVT, dl, LDChain, AddTOC,
                  MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags);

  // One for loading the environment pointer.
  SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl);
  SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff);
  SDValue LoadEnvPtr =
      DAG.getLoad(RegVT, dl, LDChain, AddPtr,
                  MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags);


  // Then copy the newly loaded TOC anchor to the TOC pointer.
  SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue);
  Chain = TOCVal.getValue(0);
  Glue = TOCVal.getValue(1);

  // If the function call has an explicit 'nest' parameter, it takes the
  // place of the environment pointer.
  assert((!hasNest || !Subtarget.isAIXABI()) &&
         "Nest parameter is not supported on AIX.");
  if (!hasNest) {
    SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue);
    Chain = EnvVal.getValue(0);
    Glue = EnvVal.getValue(1);
  }

  // The rest of the indirect call sequence is the same as the non-descriptor
  // DAG.
  prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl);
}
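// Assemble the operand list for the call node: the chain, the callee (or the
// TOC-restore address for indirect calls on TOC-based ABIs), the argument
// registers, the register mask of call-preserved registers, and the glue.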
static void
buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
                  const SDLoc &dl, bool isTailCall, bool isVarArg,
                  bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
                  SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
                  SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
                  const PPCSubtarget &Subtarget, bool isIndirect) {
  const bool IsPPC64 = Subtarget.isPPC64();
  // MVT for a general purpose register.
  const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;

  // First operand is always the chain.
  Ops.push_back(Chain);

  // If it's a direct call pass the callee as the second operand.
  if (!isIndirect)
    Ops.push_back(Callee);
  else {
    assert(!isPatchPoint && "Patch point calls are not indirect.");

    // For the TOC based ABIs, we have saved the TOC pointer to the linkage
    // area on the stack (this would have been done in `LowerCall_64SVR4` or
    // `LowerCall_AIX`). The call instruction is a pseudo instruction that
    // represents both the indirect branch and a load that restores the TOC
    // pointer from the linkage area. The operand for the TOC restore is an
    // add of the TOC save offset to the stack pointer. This must be the
    // second operand: after the chain input but before any other variadic
    // arguments.
    if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
      const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();

      SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff);
      Ops.push_back(AddTOC);
    }

    // Add the register used for the environment pointer.
    if (Subtarget.usesFunctionDescriptors() && !hasNest)
      Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),
                                    RegVT));


    // Add CTR register as callee so a bctr can be emitted later.
    if (isTailCall)
      Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
  }

  // If this is a tail call add stack pointer delta.
  if (isTailCall)
    Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
  // no way to mark dependencies as implicit here.
  // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
  if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) && !isPatchPoint)
    Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));

  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
  if (isVarArg && Subtarget.is32BitELFABI())
    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *Mask =
      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  // If the glue is valid, it is the last operand.
  if (Glue.getNode())
    Ops.push_back(Glue);
}
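// Complete call lowering: materialize the callee, build the operand list,
// emit the call node (or a TC_RETURN for tail calls), close the call
// sequence, and copy the results out of their physical registers.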
SDValue PPCTargetLowering::FinishCall(
    CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
    bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
    SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {

  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI())
    setUsesTOCBasePtr(DAG);

  const bool isIndirect = isIndirectCall(Callee, DAG, Subtarget, isPatchPoint);
  unsigned CallOpc = getCallOpcode(isIndirect, isPatchPoint, isTailCall,
                                   DAG.getMachineFunction().getFunction(),
                                   Callee, Subtarget, DAG.getTarget());

  if (!isIndirect)
    Callee = transformCallee(Callee, DAG, dl, Subtarget);
  else if (Subtarget.usesFunctionDescriptors())
    prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CS,
                                  dl, hasNest, Subtarget);
  else
    prepareIndirectCall(DAG, Callee, Glue, Chain, dl);

  // Build the operand list for the call instruction.
  SmallVector<SDValue, 8> Ops;
  buildCallOperands(Ops, CallConv, dl, isTailCall, isVarArg, isPatchPoint,
                    hasNest, DAG, RegsToPass, Glue, Chain, Callee, SPDiff,
                    Subtarget, isIndirect);

  // Emit tail call.
  if (isTailCall) {
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee)) &&
           "Expecting a global address, external symbol, absolute value or "
           "register");
    assert(CallOpc == PPCISD::TC_RETURN &&
           "Unexpected call opcode for a tail call.");
    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
    return DAG.getNode(CallOpc, dl, MVT::Other, Ops);
  }

  std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
  Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
  Glue = Chain.getValue(1);

  // When performing tail call optimization the callee pops its arguments off
  // the stack. Account for this here so these bytes can be pushed back on in
  // PPCFrameLowering::eliminateCallFramePseudoInstr.
  int BytesCalleePops = (CallConv == CallingConv::Fast &&
                         getTargetMachine().Options.GuaranteedTailCallOpt)
                            ? NumBytes
                            : 0;

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                             DAG.getIntPtrConstant(BytesCalleePops, dl, true),
                             Glue, dl);
  Glue = Chain.getValue(1);

  return LowerCallResult(Chain, Glue, CallConv, isVarArg, Ins, dl, DAG, InVals);
}

SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool isVarArg = CLI.IsVarArg;
  bool isPatchPoint = CLI.IsPatchPoint;
  ImmutableCallSite CS = CLI.CS;
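  // Tail call eligibility is ABI specific: the 64-bit SVR4 path also allows
  // sibling calls, while the generic path requires GuaranteedTailCallOpt.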
if (isTailCall) {
|
|
if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
|
|
isTailCall = false;
|
|
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
|
|
isTailCall =
|
|
IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
|
|
isVarArg, Outs, Ins, DAG);
|
|
else
|
|
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
|
|
Ins, DAG);
|
|
if (isTailCall) {
|
|
++NumTailCalls;
|
|
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
|
|
++NumSiblingCalls;
|
|
|
|
assert(isa<GlobalAddressSDNode>(Callee) &&
|
|
"Callee should be an llvm::Function object.");
|
|
LLVM_DEBUG(
|
|
const GlobalValue *GV =
|
|
cast<GlobalAddressSDNode>(Callee)->getGlobal();
|
|
const unsigned Width =
|
|
80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0");
|
|
dbgs() << "TCO caller: "
|
|
<< left_justify(DAG.getMachineFunction().getName(), Width)
|
|
<< ", callee linkage: " << GV->getVisibility() << ", "
|
|
<< GV->getLinkage() << "\n");
|
|
}
|
|
}
|
|
|
|
if (!isTailCall && CS && CS.isMustTailCall())
|
|
report_fatal_error("failed to perform tail call elimination on a call "
|
|
"site marked musttail");
|
|
|
|
// When long calls (i.e. indirect calls) are always used, calls are always
|
|
// made via function pointer. If we have a function name, first translate it
|
|
// into a pointer.
|
|
if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
|
|
!isTailCall)
|
|
Callee = LowerGlobalAddress(Callee, DAG);
|
|
|
|
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
|
|
return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
|
|
isTailCall, isPatchPoint, Outs, OutVals, Ins,
|
|
dl, DAG, InVals, CS);
|
|
|
|
if (Subtarget.isSVR4ABI())
|
|
return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
|
|
isTailCall, isPatchPoint, Outs, OutVals, Ins,
|
|
dl, DAG, InVals, CS);
|
|
|
|
if (Subtarget.isAIXABI())
|
|
return LowerCall_AIX(Chain, Callee, CallConv, isVarArg,
|
|
isTailCall, isPatchPoint, Outs, OutVals, Ins,
|
|
dl, DAG, InVals, CS);
|
|
|
|
return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
|
|
isTailCall, isPatchPoint, Outs, OutVals, Ins,
|
|
dl, DAG, InVals, CS);
|
|
}
|
|
|
|
SDValue PPCTargetLowering::LowerCall_32SVR4(
|
|
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
|
|
bool isTailCall, bool isPatchPoint,
|
|
const SmallVectorImpl<ISD::OutputArg> &Outs,
|
|
const SmallVectorImpl<SDValue> &OutVals,
|
|
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
|
|
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
|
|
ImmutableCallSite CS) const {
|
|
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
|
|
// of the 32-bit SVR4 ABI stack frame layout.
|
|
|
|
assert((CallConv == CallingConv::C ||
|
|
CallConv == CallingConv::Cold ||
|
|
CallConv == CallingConv::Fast) && "Unknown calling convention!");
|
|
|
|
unsigned PtrByteSize = 4;
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
// Mark this function as potentially containing a function that contains a
|
|
// tail call. As a consequence the frame pointer will be used for dynamicalloc
|
|
// and restoring the callers stack pointer in this functions epilog. This is
|
|
// done because by tail calling the called function might overwrite the value
|
|
// in this function's (MF) stack pointer stack slot 0(SP).
|
|
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
|
|
CallConv == CallingConv::Fast)
|
|
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
|
|
|
|
// Count how many bytes are to be pushed on the stack, including the linkage
|
|
// area, parameter list area and the part of the local variable space which
|
|
// contains copies of aggregates which are passed by value.
|
|
|
|
// Assign locations to all of the outgoing arguments.
|
|
SmallVector<CCValAssign, 16> ArgLocs;
|
|
PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
|
|
|
|
// Reserve space for the linkage area on the stack.
|
|
CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
|
|
PtrByteSize);
|
|
if (useSoftFloat())
|
|
CCInfo.PreAnalyzeCallOperands(Outs);
|
|
|
|
if (isVarArg) {
|
|
// Handle fixed and variable vector arguments differently.
|
|
// Fixed vector arguments go into registers as long as registers are
|
|
// available. Variable vector arguments always go into memory.
|
|
unsigned NumArgs = Outs.size();
|
|
|
|
for (unsigned i = 0; i != NumArgs; ++i) {
|
|
MVT ArgVT = Outs[i].VT;
|
|
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
|
|
bool Result;
|
|
|
|
if (Outs[i].IsFixed) {
|
|
        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
                               CCInfo);
      } else {
        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
                                      ArgFlags, CCInfo);
      }

      if (Result) {
#ifndef NDEBUG
        errs() << "Call operand #" << i << " has unhandled type "
               << EVT(ArgVT).getEVTString() << "\n";
#endif
        llvm_unreachable(nullptr);
      }
    }
  } else {
    // All arguments are treated the same.
    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
  }
  CCInfo.clearWasPPCF128();

  // Assign locations to all of the outgoing aggregate by value arguments.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);

  // Size of the linkage area, parameter list area and the part of the local
  // variable space where copies of aggregates which are passed by value are
  // stored.
  unsigned NumBytes = CCByValInfo.getNextStackOffset();

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
  SmallVector<SDValue, 8> MemOpChains;

  bool seenFloatArg = false;
  // Walk the register/memloc assignments, inserting copies/loads.
  // i - Tracks the index into the list of registers allocated for the call.
  // RealArgIdx - Tracks the index into the list of actual function arguments.
  // j - Tracks the index into the list of byval arguments.
  for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
       i != e;
       ++i, ++RealArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[RealArgIdx];
    ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;

    if (Flags.isByVal()) {
      // Argument is an aggregate which is passed by value, thus we need to
      // create a copy of it in the local variable space of the current stack
      // frame (which is the stack frame of the caller) and pass the address
      // of this copy to the callee.
      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
      CCValAssign &ByValVA = ByValArgLocs[j++];
      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");

      // Memory reserved in the local variable space of the caller's stack
      // frame.
      unsigned LocMemOffset = ByValVA.getLocMemOffset();

      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
                           StackPtr, PtrOff);

      // Create a copy of the argument in the local area of the current
      // stack frame.
      SDValue MemcpyCall =
          CreateCopyOfByValArgument(Arg, PtrOff,
                                    CallSeqStart.getNode()->getOperand(0),
                                    Flags, DAG, dl);

      // This must go outside the CALLSEQ_START..END.
      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
                                                     SDLoc(MemcpyCall));
      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                             NewCallSeqStart.getNode());
      Chain = CallSeqStart = NewCallSeqStart;

      // Pass the address of the aggregate copy on the stack either in a
      // physical register or in the parameter list area of the current stack
      // frame to the callee.
      Arg = PtrOff;
    }

    // When useCRBits() is true, there can be i1 arguments.
    // It is because getRegisterType(MVT::i1) => MVT::i1,
    // and for other integer types getRegisterType() => MVT::i32.
    // Extend i1 here so the callee will receive an i32.
    if (Arg.getValueType() == MVT::i1)
      Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                        dl, MVT::i32, Arg);

    if (VA.isRegLoc()) {
      seenFloatArg |= VA.getLocVT().isFloatingPoint();
      // Put argument in a physical register.
      if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
        bool IsLE = Subtarget.isLittleEndian();
        SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
                                   DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
        SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
                           DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
        RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),
                                            SVal.getValue(0)));
      } else
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      // Put argument in the parameter list area of the current stack frame.
      assert(VA.isMemLoc());
      unsigned LocMemOffset = VA.getLocMemOffset();

      if (!isTailCall) {
        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
                             StackPtr, PtrOff);

        MemOpChains.push_back(
            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
      } else {
        // Calculate and remember argument location.
        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
                                 TailCallArguments);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // Set CR bit 6 to true if this is a vararg call with floating args passed
  // in registers.
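  // Under the 32-bit SVR4 ABI, the caller of a varargs function uses CR bit
  // 6 to tell the callee whether any floating-point arguments were passed in
  // FPRs, so the callee's prologue knows whether the FPR argument registers
  // must be saved for va_arg to find them.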
  if (isVarArg) {
    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, InFlag };

    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));

    InFlag = Chain.getValue(1);
  }

  if (isTailCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
                    /* unused except on PPC64 ELFv1 */ false, DAG,
                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
                    NumBytes, Ins, InVals, CS);
}

// Copy an argument into memory, being careful to do this outside the
// call sequence for the call to which the argument belongs.
SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
    SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
    SelectionDAG &DAG, const SDLoc &dl) const {
  SDValue MemcpyCall =
      CreateCopyOfByValArgument(Arg, PtrOff,
                                CallSeqStart.getNode()->getOperand(0),
                                Flags, DAG, dl);
  // The MEMCPY must go outside the CALLSEQ_START..END.
  int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
                                                 SDLoc(MemcpyCall));
  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                         NewCallSeqStart.getNode());
  return NewCallSeqStart;
}

SDValue PPCTargetLowering::LowerCall_64SVR4(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite CS) const {
  bool isELFv2ABI = Subtarget.isELFv2ABI();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned NumOps = Outs.size();
  bool hasNest = false;
  bool IsSibCall = false;

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  unsigned PtrByteSize = 8;

  MachineFunction &MF = DAG.getMachineFunction();

  if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
    IsSibCall = true;

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamic
  // allocas and for restoring the caller's stack pointer in this function's
  // epilog. This is done because, by tail calling, the called function might
  // overwrite the value in this function's (MF) stack pointer stack slot
  // 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  assert(!(CallConv == CallingConv::Fast && isVarArg) &&
         "fastcc not supported on varargs functions");

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  unsigned &QFPR_idx = FPR_idx;

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned NumGPRs = array_lengthof(GPR);
  const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
  const unsigned NumVRs = array_lengthof(VR);
  const unsigned NumQFPRs = NumFPRs;

  // On ELFv2, we can avoid allocating the parameter area if all the arguments
  // can be passed to the callee in registers.
  // For the fast calling convention, there is another check below.
  // Note: We should keep this consistent with LowerFormalArguments_64SVR4().
  bool HasParameterArea =
      !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
  if (!HasParameterArea) {
    unsigned ParamAreaSize = NumGPRs * PtrByteSize;
    unsigned AvailableFPRs = NumFPRs;
    unsigned AvailableVRs = NumVRs;
    unsigned NumBytesTmp = NumBytes;
    for (unsigned i = 0; i != NumOps; ++i) {
      if (Outs[i].Flags.isNest()) continue;
      if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
                                 PtrByteSize, LinkageSize, ParamAreaSize,
                                 NumBytesTmp, AvailableFPRs, AvailableVRs,
                                 Subtarget.hasQPX()))
        HasParameterArea = true;
    }
  }

  // When using the fast calling convention, we don't provide backing for
  // arguments that will be in registers.
  unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;

  // Avoid allocating the parameter area for fastcc functions if all the
  // arguments can be passed in the registers.
  if (CallConv == CallingConv::Fast)
    HasParameterArea = false;

  // Add up all the space actually used.
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    EVT OrigVT = Outs[i].ArgVT;

    if (Flags.isNest())
      continue;

    if (CallConv == CallingConv::Fast) {
      if (Flags.isByVal()) {
        NumGPRsUsed += (Flags.getByValSize()+7)/8;
        if (NumGPRsUsed > NumGPRs)
          HasParameterArea = true;
      } else {
        switch (ArgVT.getSimpleVT().SimpleTy) {
        default: llvm_unreachable("Unexpected ValueType for argument!");
        case MVT::i1:
        case MVT::i32:
        case MVT::i64:
          if (++NumGPRsUsed <= NumGPRs)
            continue;
          break;
        case MVT::v4i32:
        case MVT::v8i16:
        case MVT::v16i8:
        case MVT::v2f64:
        case MVT::v2i64:
        case MVT::v1i128:
        case MVT::f128:
          if (++NumVRsUsed <= NumVRs)
            continue;
          break;
        case MVT::v4f32:
          // When using QPX, this is handled like a FP register; otherwise it
          // is an Altivec register.
          if (Subtarget.hasQPX()) {
            if (++NumFPRsUsed <= NumFPRs)
              continue;
          } else {
            if (++NumVRsUsed <= NumVRs)
              continue;
          }
          break;
        case MVT::f32:
        case MVT::f64:
        case MVT::v4f64: // QPX
        case MVT::v4i1:  // QPX
          if (++NumFPRsUsed <= NumFPRs)
            continue;
          break;
        }
        HasParameterArea = true;
      }
    }

    /* Respect alignment of argument on the stack. */
    unsigned Align =
        CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
    NumBytes = ((NumBytes + Align - 1) / Align) * Align;

    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
    if (Flags.isInConsecutiveRegsLast())
      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
  }

  unsigned NumBytesActuallyUsed = NumBytes;

  // In the old ELFv1 ABI, the prolog code of the callee may store up to 8
  // GPR argument registers to the stack, allowing va_start to index over
  // them in memory if it's varargs.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed. As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  // In the ELFv2 ABI, we allocate the parameter area iff a callee
  // really requires memory operands, e.g. a vararg function.
  if (HasParameterArea)
    NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
  else
    NumBytes = LinkageSize;

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

  int SPDiff = 0;

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  if (!IsSibCall)
    SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);

  // Figure out which arguments are going to go in registers, and which in
  // memory. Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = LinkageSize;

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

  SmallVector<SDValue, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    EVT OrigVT = Outs[i].ArgVT;

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff;

    // We re-align the argument offset for each argument, except when using
    // the fast calling convention, when we need to make sure we do that only
    // when we'll actually use a stack slot.
    auto ComputePtrOff = [&]() {
      /* Respect alignment of argument on the stack. */
      unsigned Align =
          CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;

      PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());

      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    };

    if (CallConv != CallingConv::Fast) {
      ComputePtrOff();

      /* Compute GPR index associated with argument offset. */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, NumGPRs);
    }

    // Promote integers to 64-bit values.
    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
    }

    // FIXME: memcpy is used way more than necessary. Correctness first.
    // Note: "by value" is code for passing a structure by value, not
    // basic types.
    if (Flags.isByVal()) {
      // Note: Size includes alignment padding, so
      //   struct x { short a; char b; }
      // will have Size = 4. With #pragma pack(1), it will have Size = 3.
      // These are the proper values we need for right-justifying the
      // aggregate in a parameter register.
      unsigned Size = Flags.getByValSize();

      // An empty aggregate parameter takes up no storage and no
      // registers.
      if (Size == 0)
        continue;

      if (CallConv == CallingConv::Fast)
        ComputePtrOff();

      // All aggregates smaller than 8 bytes must be passed right-justified.
      if (Size == 1 || Size == 2 || Size == 4) {
        EVT VT = (Size == 1) ? MVT::i8 : ((Size == 2) ? MVT::i16 : MVT::i32);
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                        MachinePointerInfo(), VT);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

          ArgOffset += PtrByteSize;
          continue;
        }
      }

      if (GPR_idx == NumGPRs && Size < 8) {
        SDValue AddPtr = PtrOff;
        if (!isLittleEndian) {
          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
                                          PtrOff.getValueType());
          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
        }
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);
        ArgOffset += PtrByteSize;
        continue;
      }
      // Copy the entire object into memory. There are cases where
      // gcc-generated code assumes it is there, even if it could be put
      // entirely into registers. (This is not what the doc says.)

      // FIXME: The above statement is likely due to a misunderstanding of the
      // documents. All arguments must be copied into the parameter area BY
      // THE CALLEE in the event that the callee takes the address of any
      // formal argument. That has not yet been implemented. However, it is
      // reasonable to use the stack area as a staging area for the register
      // load.

      // Skip this for small aggregates, as we will use the same slot for a
      // right-justified copy, below.
      if (Size >= 8)
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);

      // When a register is available, pass a small aggregate right-justified.
      if (Size < 8 && GPR_idx != NumGPRs) {
        // The easiest way to get this right-justified in a register
        // is to copy the structure into the rightmost portion of a
        // local variable slot, then load the whole slot into the
        // register.
        // FIXME: The memcpy seems to produce pretty awful code for
        // small aggregates, particularly for packed ones.
        // FIXME: It would be preferable to use the slot in the
        // parameter save area instead of a new local variable.
        SDValue AddPtr = PtrOff;
        if (!isLittleEndian) {
          SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
        }
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);

        // Load the slot into the register.
        SDValue Load =
            DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
        MemOpChains.push_back(Load.getValue(1));
        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

        // Done with this argument.
        ArgOffset += PtrByteSize;
        continue;
      }

      // For aggregates larger than PtrByteSize, copy the pieces of the
      // object that fit into registers from the parameter save area.
      for (unsigned j = 0; j < Size; j += PtrByteSize) {
        SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
        if (GPR_idx != NumGPRs) {
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          ArgOffset += PtrByteSize;
        } else {
          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
          break;
        }
      }
      continue;
    }

    switch (Arg.getSimpleValueType().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (Flags.isNest()) {
        // The 'nest' parameter, if any, is passed in R11.
        RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
        hasNest = true;
        break;
      }

      // These can be scalar arguments or elements of an integer array type
      // passed directly. Clang may use those instead of "byval" aggregate
      // types to avoid forcing arguments to memory unnecessarily.
      if (GPR_idx != NumGPRs) {
        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
      } else {
        if (CallConv == CallingConv::Fast)
          ComputePtrOff();

        assert(HasParameterArea &&
               "Parameter area must exist to pass an argument in memory.");
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
        if (CallConv == CallingConv::Fast)
          ArgOffset += PtrByteSize;
      }
      if (CallConv != CallingConv::Fast)
        ArgOffset += PtrByteSize;
      break;
    case MVT::f32:
    case MVT::f64: {
      // These can be scalar arguments or elements of a float array type
      // passed directly. The latter are used to implement ELFv2 homogeneous
      // float aggregates.

      // Named arguments go into FPRs first, and once they overflow, the
      // remaining arguments go into GPRs and then the parameter save area.
      // Unnamed arguments for vararg functions always go to GPRs and
      // then the parameter save area. For now, put all arguments to vararg
      // routines always in both locations (FPR *and* GPR or stack slot).
      bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
      bool NeededLoad = false;

      // First load the argument into the next available FPR.
      if (FPR_idx != NumFPRs)
        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

      // Next, load the argument into GPR or stack slot if needed.
      if (!NeedGPROrStack)
        ;
      else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
        // FIXME: We may want to re-enable this for CallingConv::Fast on the
        // P8 once we support fp <-> gpr moves.

        // In the non-vararg case, this can only ever happen in the
        // presence of f32 array types, since otherwise we never run
        // out of FPRs before running out of GPRs.
        SDValue ArgVal;

        // Double values are always passed in a single GPR.
        if (Arg.getValueType() != MVT::f32) {
          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);

        // Non-array float values are extended and passed in a GPR.
        } else if (!Flags.isInConsecutiveRegs()) {
          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);

        // If we have an array of floats, we collect every odd element
        // together with its predecessor into one GPR.
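        // For example, elements 0 and 1 of a float array share one GPR:
        // BUILD_PAIR below combines the two 32-bit halves, and the swap for
        // big-endian targets puts the earlier element in the more
        // significant word, matching the in-memory layout.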
        } else if (ArgOffset % PtrByteSize != 0) {
          SDValue Lo, Hi;
          Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
          Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
          if (!isLittleEndian)
            std::swap(Lo, Hi);
          ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);

        // The final element, if even, goes into the first half of a GPR.
        } else if (Flags.isInConsecutiveRegsLast()) {
          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
          if (!isLittleEndian)
            ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
                                 DAG.getConstant(32, dl, MVT::i32));

        // Non-final even elements are skipped; they will be handled
        // together with the subsequent argument on the next go-around.
        } else
          ArgVal = SDValue();

        if (ArgVal.getNode())
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
      } else {
        if (CallConv == CallingConv::Fast)
          ComputePtrOff();

        // Single-precision floating-point values are mapped to the
        // second (rightmost) word of the stack doubleword.
        if (Arg.getValueType() == MVT::f32 &&
            !isLittleEndian && !Flags.isInConsecutiveRegs()) {
          SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
        }

        assert(HasParameterArea &&
               "Parameter area must exist to pass an argument in memory.");
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);

        NeededLoad = true;
      }
      // When passing an array of floats, the array occupies consecutive
      // space in the argument area; only round up to the next doubleword
      // at the end of the array. Otherwise, each float takes 8 bytes.
      if (CallConv != CallingConv::Fast || NeededLoad) {
        ArgOffset += (Arg.getValueType() == MVT::f32 &&
                      Flags.isInConsecutiveRegs()) ? 4 : 8;
        if (Flags.isInConsecutiveRegsLast())
          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      }
      break;
    }
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
    case MVT::v1i128:
    case MVT::f128:
      if (!Subtarget.hasQPX()) {
        // These can be scalar arguments or elements of a vector array type
        // passed directly. The latter are used to implement ELFv2 homogeneous
        // vector aggregates.

        // For a varargs call, named arguments go into VRs or on the stack as
        // usual; unnamed arguments always go to the stack or the
        // corresponding GPRs when within range. For now, we always put the
        // value in both locations (or even all three).
        if (isVarArg) {
          assert(HasParameterArea &&
                 "Parameter area must exist if we have a varargs call.");
          // We could elide this store in the case where the object fits
          // entirely in R registers. Maybe later.
          SDValue Store =
              DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
          MemOpChains.push_back(Store);
          if (VR_idx != NumVRs) {
            SDValue Load =
                DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
                            MachinePointerInfo());
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
          }
          ArgOffset += 16;
          for (unsigned i = 0; i < 16; i += PtrByteSize) {
            if (GPR_idx == NumGPRs)
              break;
            SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                     DAG.getConstant(i, dl, PtrVT));
            SDValue Load =
                DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
          break;
        }

        // Non-varargs Altivec params go into VRs or on the stack.
        if (VR_idx != NumVRs) {
          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
        } else {
          if (CallConv == CallingConv::Fast)
            ComputePtrOff();

          assert(HasParameterArea &&
                 "Parameter area must exist to pass an argument in memory.");
          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                           true, isTailCall, true, MemOpChains,
                           TailCallArguments, dl);
          if (CallConv == CallingConv::Fast)
            ArgOffset += 16;
        }

        if (CallConv != CallingConv::Fast)
          ArgOffset += 16;
        break;
      } // not QPX

      assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
             "Invalid QPX parameter type");

      LLVM_FALLTHROUGH;
    case MVT::v4f64:
    case MVT::v4i1: {
      bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
      if (isVarArg) {
        assert(HasParameterArea &&
               "Parameter area must exist if we have a varargs call.");
        // We could elide this store in the case where the object fits
        // entirely in R registers. Maybe later.
        SDValue Store =
            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
        MemOpChains.push_back(Store);
        if (QFPR_idx != NumQFPRs) {
          SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl,
                                     Store, PtrOff, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load));
        }
        ArgOffset += (IsF32 ? 16 : 32);
        for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) {
          if (GPR_idx == NumGPRs)
            break;
          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                   DAG.getConstant(i, dl, PtrVT));
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
        }
        break;
      }

      // Non-varargs QPX params go into registers or on the stack.
      if (QFPR_idx != NumQFPRs) {
        RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
      } else {
        if (CallConv == CallingConv::Fast)
          ComputePtrOff();

        assert(HasParameterArea &&
               "Parameter area must exist to pass an argument in memory.");
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, isTailCall, true, MemOpChains,
                         TailCallArguments, dl);
        if (CallConv == CallingConv::Fast)
          ArgOffset += (IsF32 ? 16 : 32);
      }

      if (CallConv != CallingConv::Fast)
        ArgOffset += (IsF32 ? 16 : 32);
      break;
    }
    }
  }

  assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
         "mismatch in size of parameter area");
  (void)NumBytesActuallyUsed;

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Check if this is an indirect call (MTCTR/BCTRL).
  // See prepareDescriptorIndirectCall and buildCallOperands for more
  // information about calls through function pointers in the 64-bit SVR4 ABI.
  if (!isTailCall && !isPatchPoint &&
      !isFunctionGlobalAddress(Callee) &&
      !isa<ExternalSymbolSDNode>(Callee)) {
    // Load r2 into a virtual register and store it to the TOC save area.
    setUsesTOCBasePtr(DAG);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
    // TOC save area offset.
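    // (getTOCSaveOffset() returns the TOC slot within the linkage area:
    // 40 bytes under ELFv1 and 24 bytes under ELFv2.)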
    unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    Chain = DAG.getStore(
        Val.getValue(1), dl, Val, AddPtr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
    // This does not mean the MTCTR instruction must use R12; it's easier
    // to model this as an extra parameter, so do that.
    if (isELFv2ABI && !isPatchPoint)
      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall && !IsSibCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
                    DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
                    SPDiff, NumBytes, Ins, InVals, CS);
}

SDValue PPCTargetLowering::LowerCall_Darwin(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite CS) const {
  unsigned NumOps = Outs.size();

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence the frame pointer will be used for dynamic
  // allocas and for restoring the caller's stack pointer in this function's
  // epilog. This is done because, by tail calling, the called function might
  // overwrite the value in this function's (MF) stack pointer stack slot
  // 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area. We start with 24/48 bytes, which is
  // prereserved space for [SP][CR][LR][3 x unused].
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  unsigned NumBytes = LinkageSize;

  // Add up all the space actually used.
  // In 32-bit non-varargs calls, Altivec parameters all go at the end;
  // usually they all go in registers, but we must reserve stack space for
  // them for possible use by the caller. In varargs or 64-bit calls,
  // parameters are assigned stack space in order, with padding so Altivec
  // parameters are 16-byte aligned.
  unsigned nAltivecParamsAtEnd = 0;
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    // Varargs Altivec parameters are padded to a 16 byte boundary.
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
      if (!isVarArg && !isPPC64) {
        // Non-varargs Altivec parameters go after all the non-Altivec
        // parameters; handle those later so we know how much padding we need.
        nAltivecParamsAtEnd++;
        continue;
      }
      // Varargs and 64-bit Altivec parameters are padded to a 16 byte
      // boundary.
      NumBytes = ((NumBytes+15)/16)*16;
    }
    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  }

  // Allow for Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    NumBytes = ((NumBytes+15)/16)*16;
    NumBytes += 16*nAltivecParamsAtEnd;
  }

  // The prolog code of the callee may store up to 8 GPR argument registers
  // to the stack, allowing va_start to index over them in memory if it's
  // varargs.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed. As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr;
  if (isPPC64)
    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
  else
    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory. Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

  static const MCPhysReg GPR_32[] = { // 32-bit registers.
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  static const MCPhysReg GPR_64[] = { // 64-bit registers.
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };
  const unsigned NumGPRs = array_lengthof(GPR_32);
  const unsigned NumFPRs = 13;
  const unsigned NumVRs = array_lengthof(VR);

  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

  SmallVector<SDValue, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff;

    PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());

    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    // On PPC64, promote integers to 64-bit values.
    if (isPPC64 && Arg.getValueType() == MVT::i32) {
      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
    }

    // FIXME: memcpy is used way more than necessary. Correctness first.
    // Note: "by value" is code for passing a structure by value, not
    // basic types.
    if (Flags.isByVal()) {
      unsigned Size = Flags.getByValSize();
      // Very small objects are passed right-justified. Everything else is
      // passed left-justified.
      if (Size == 1 || Size == 2) {
        EVT VT = (Size == 1) ? MVT::i8 : MVT::i16;
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                        MachinePointerInfo(), VT);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

          ArgOffset += PtrByteSize;
        } else {
          SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
                                          PtrOff.getValueType());
          SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                            CallSeqStart,
                                                            Flags, DAG, dl);
          ArgOffset += PtrByteSize;
        }
        continue;
      }
      // Copy the entire object into memory. There are cases where
      // gcc-generated code assumes it is there, even if it could be put
      // entirely into registers. (This is not what the doc says.)
      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
                                                        CallSeqStart,
                                                        Flags, DAG, dl);

      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
      // copy the pieces of the object that fit into registers from the
      // parameter save area.
      for (unsigned j = 0; j < Size; j += PtrByteSize) {
        SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
        if (GPR_idx != NumGPRs) {
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          ArgOffset += PtrByteSize;
        } else {
          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
          break;
        }
      }
      continue;
    }

    switch (Arg.getSimpleValueType().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (GPR_idx != NumGPRs) {
        if (Arg.getValueType() == MVT::i1)
          Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);

        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
      } else {
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      }
      ArgOffset += PtrByteSize;
      break;
    case MVT::f32:
    case MVT::f64:
      if (FPR_idx != NumFPRs) {
        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

        if (isVarArg) {
          SDValue Store =
              DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
          MemOpChains.push_back(Store);

          // Float varargs are always shadowed in available integer registers.
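          // (On 32-bit targets an f64 shadows two GPRs, which is why a
          // second word is loaded from PtrOff + 4 below.)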
          if (GPR_idx != NumGPRs) {
            SDValue Load =
                DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
              !isPPC64) {
            SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
            SDValue Load =
                DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo());
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
        } else {
          // If we have any FPRs remaining, we may also have GPRs remaining.
          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
          // GPRs.
          if (GPR_idx != NumGPRs)
            ++GPR_idx;
          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
              !isPPC64) // PPC64 has 64-bit GPRs obviously :)
            ++GPR_idx;
        }
      } else
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      if (isPPC64)
        ArgOffset += 8;
      else
        ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (isVarArg) {
        // These go aligned on the stack, or in the corresponding R registers
        // when within range. The Darwin PPC ABI doc claims they also go in
        // V registers; in fact gcc does this only for arguments that are
        // prototyped, not for those that match the ellipsis. We do it for
        // all arguments; this seems to work.
        while (ArgOffset % 16 != 0) {
          ArgOffset += PtrByteSize;
          if (GPR_idx != NumGPRs)
            GPR_idx++;
        }
        // We could elide this store in the case where the object fits
        // entirely in R registers. Maybe later.
        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                             DAG.getConstant(ArgOffset, dl, PtrVT));
        SDValue Store =
            DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
        MemOpChains.push_back(Store);
        if (VR_idx != NumVRs) {
          SDValue Load =
              DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
                          MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
        }
        ArgOffset += 16;
        for (unsigned i = 0; i < 16; i += PtrByteSize) {
          if (GPR_idx == NumGPRs)
            break;
          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                   DAG.getConstant(i, dl, PtrVT));
          SDValue Load =
              DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
        }
        break;
      }

      // Non-varargs Altivec params generally go in registers, but have
      // stack space allocated at the end.
      if (VR_idx != NumVRs) {
        // Doesn't have GPR space allocated.
        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
      } else if (nAltivecParamsAtEnd == 0) {
        // We are emitting Altivec params in order.
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, true, MemOpChains,
                         TailCallArguments, dl);
        ArgOffset += 16;
      }
      break;
    }
  }
  // If all Altivec parameters fit in registers, as they usually do,
  // they get stack space following the non-Altivec parameters. We
  // don't track this here because nobody below needs it.
  // If there are more Altivec parameters than fit in registers, emit
  // the stores here.
  if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
    unsigned j = 0;
    // Offset is aligned; skip the first 12 params, which go in V registers.
    ArgOffset = ((ArgOffset+15)/16)*16;
    ArgOffset += 12*16;
    for (unsigned i = 0; i != NumOps; ++i) {
      SDValue Arg = OutVals[i];
      EVT ArgType = Outs[i].VT;
      if (ArgType == MVT::v4f32 || ArgType == MVT::v4i32 ||
          ArgType == MVT::v8i16 || ArgType == MVT::v16i8) {
        if (++j > NumVRs) {
          SDValue PtrOff;
          // We are emitting Altivec params in order.
          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                           isPPC64, isTailCall, true, MemOpChains,
                           TailCallArguments, dl);
          ArgOffset += 16;
        }
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // On Darwin, R12 must contain the address of an indirect callee. This does
  // not mean the MTCTR instruction must use R12; it's easier to model this
  // as an extra parameter, so do that.
  if (!isTailCall &&
      !isFunctionGlobalAddress(Callee) &&
      !isa<ExternalSymbolSDNode>(Callee) &&
      !isBLACompatibleAddress(Callee, DAG))
    RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
                                                   PPC::R12), Callee));

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                    TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
                    /* unused except on PPC64 ELFv1 */ false, DAG,
                    RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
                    NumBytes, Ins, InVals, CS);
}

static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
                   CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
                   CCState &State) {

  if (ValVT == MVT::f128)
    report_fatal_error("f128 is unimplemented on AIX.");

  if (ArgFlags.isByVal())
    report_fatal_error("Passing structure by value is unimplemented.");

  if (ArgFlags.isNest())
    report_fatal_error("Nest arguments are unimplemented.");

  if (ValVT.isVector() || LocVT.isVector())
    report_fatal_error("Vector arguments are unimplemented on AIX.");

  const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
      State.getMachineFunction().getSubtarget());
  const bool IsPPC64 = Subtarget.isPPC64();
  const unsigned PtrByteSize = IsPPC64 ? 8 : 4;

  static const MCPhysReg GPR_32[] = {// 32-bit registers.
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10};
  static const MCPhysReg GPR_64[] = {// 64-bit registers.
      PPC::X3, PPC::X4, PPC::X5, PPC::X6,
      PPC::X7, PPC::X8, PPC::X9, PPC::X10};

  // Arguments always reserve space in the parameter save area.
  switch (ValVT.SimpleTy) {
  default:
    report_fatal_error("Unhandled value type for argument.");
  case MVT::i64:
    // i64 arguments should have been split to i32 for PPC32.
    assert(IsPPC64 && "PPC32 should have split i64 values.");
    LLVM_FALLTHROUGH;
  case MVT::i1:
  case MVT::i32:
    State.AllocateStack(PtrByteSize, PtrByteSize);
    if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
      MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
      // Promote integers if needed.
      if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
        LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
                                    : CCValAssign::LocInfo::ZExt;
      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
    }
    else
      report_fatal_error("Handling of placing parameters on the stack is "
                         "unimplemented!");
    return false;

  case MVT::f32:
  case MVT::f64: {
    // The parameter save area (PSA) is reserved even if the float passes in
    // an FPR.
    const unsigned StoreSize = LocVT.getStoreSize();
    // Floats are always 4-byte aligned in the PSA on AIX.
    // This includes f64 in 64-bit mode for ABI compatibility.
    State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4);
    if (unsigned Reg = State.AllocateReg(FPR))
      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
    else
      report_fatal_error("Handling of placing parameters on the stack is "
                         "unimplemented!");

    // AIX requires that GPRs are reserved for float arguments.
    // Successfully reserved GPRs are only initialized for vararg calls.
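    // (An f64 shadows two GPRs on PPC32 and one on PPC64; an f32 always
    // shadows one. The 'custom' locations added below tell the call lowering
    // to also materialize the float's bits in those GPR(s) for vararg
    // callees.)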
    MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
    for (unsigned I = 0; I < StoreSize; I += PtrByteSize) {
      if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
        if (State.isVarArg()) {
          // Custom handling is required for:
          //   f64 in PPC32 needs to be split into 2 GPRs.
          //   f32 in PPC64 needs to occupy only the lower 32 bits of a
          //   64-bit GPR.
          State.addLoc(
              CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
        }
      } else if (State.isVarArg()) {
        report_fatal_error("Handling of placing parameters on the stack is "
                           "unimplemented!");
      }
    }

    return false;
  }
  }
  return true;
}

static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
                                                    bool IsPPC64) {
  assert((IsPPC64 || SVT != MVT::i64) &&
         "i64 should have been split for 32-bit codegen.");

  switch (SVT) {
  default:
    report_fatal_error("Unexpected value type for formal argument");
  case MVT::i1:
  case MVT::i32:
  case MVT::i64:
    return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  case MVT::f32:
    return &PPC::F4RCRegClass;
  case MVT::f64:
    return &PPC::F8RCRegClass;
  }
}

static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
                                        SelectionDAG &DAG, SDValue ArgValue,
                                        MVT LocVT, const SDLoc &dl) {
  assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
  assert(ValVT.getSizeInBits() < LocVT.getSizeInBits());

  if (Flags.isSExt())
    ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
                           DAG.getValueType(ValVT));
  else if (Flags.isZExt())
    ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,
                           DAG.getValueType(ValVT));

  return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
}

SDValue PPCTargetLowering::LowerFormalArguments_AIX(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) &&
         "Unexpected calling convention!");

  if (isVarArg)
    report_fatal_error("This call type is unimplemented on AIX.");

  if (getTargetMachine().Options.GuaranteedTailCallOpt)
    report_fatal_error("Tail call support is unimplemented on AIX.");

  if (useSoftFloat())
    report_fatal_error("Soft float support is unimplemented on AIX.");

  const PPCSubtarget &Subtarget =
      static_cast<const PPCSubtarget &>(DAG.getSubtarget());
  if (Subtarget.hasQPX())
    report_fatal_error("QPX is not supported on AIX.");

  const bool IsPPC64 = Subtarget.isPPC64();
  const unsigned PtrByteSize = IsPPC64 ? 8 : 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  MachineFunction &MF = DAG.getMachineFunction();
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  // On AIX a minimum of 8 words is saved to the parameter save area.
  const unsigned MinParameterSaveArea = 8 * PtrByteSize;
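  // (That is 32 bytes on PPC32 and 64 bytes on PPC64.)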
  CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize);
  CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue ArgValue;
    ISD::ArgFlagsTy Flags = Ins[i].Flags;
    if (VA.isRegLoc()) {
      EVT ValVT = VA.getValVT();
      MVT LocVT = VA.getLocVT();
      MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
      unsigned VReg =
          MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
      ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
      if (ValVT.isScalarInteger() &&
          (ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
        ArgValue =
            truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
      }
      InVals.push_back(ArgValue);
    } else {
      report_fatal_error("Handling of formal arguments on the stack is "
                         "unimplemented!");
    }
  }

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCInfo.getNextStackOffset();

  // Set the size that is at least reserved in the caller of this function.
  // A tail-call-optimized function's reserved stack space needs to be
  // aligned so that taking the difference between two stack areas will
  // result in an aligned stack.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setMinReservedArea(MinReservedArea);

  return Chain;
}

SDValue PPCTargetLowering::LowerCall_AIX(
    SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
    bool isTailCall, bool isPatchPoint,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    ImmutableCallSite CS) const {

  assert((CallConv == CallingConv::C ||
          CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) && "Unexpected calling convention!");

  if (isPatchPoint)
    report_fatal_error("This call type is unimplemented on AIX.");

  const PPCSubtarget &Subtarget =
      static_cast<const PPCSubtarget &>(DAG.getSubtarget());
  if (Subtarget.hasQPX())
    report_fatal_error("QPX is not supported on AIX.");
  if (Subtarget.hasAltivec())
    report_fatal_error("Altivec support is unimplemented on AIX.");

  MachineFunction &MF = DAG.getMachineFunction();
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Reserve space for the linkage save area (LSA) on the stack.
  // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
  //   [SP][CR][LR][2 x reserved][TOC].
  // The LSA is 24 bytes (6 x 4) in PPC32 and 48 bytes (6 x 8) in PPC64.
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  const bool IsPPC64 = Subtarget.isPPC64();
  const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
  CCInfo.AnalyzeCallOperands(Outs, CC_AIX);

  // The prolog code of the callee may store up to 8 GPR argument registers
  // to the stack, allowing va_start to index over them in memory if the
  // callee is variadic.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed. As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
  const unsigned NumBytes = LinkageSize + MinParameterSaveAreaSize;

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
  SDValue CallSeqStart = Chain;

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

  for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
    CCValAssign &VA = ArgLocs[I++];

    if (VA.isMemLoc())
      report_fatal_error("Handling of placing parameters on the stack is "
                         "unimplemented!");
    if (!VA.isRegLoc())
      report_fatal_error(
          "Unexpected non-register location for function call argument.");

    SDValue Arg = OutVals[VA.getValNo()];

    if (!VA.needsCustom()) {
      switch (VA.getLocInfo()) {
      default:
        report_fatal_error("Unexpected argument extension type.");
      case CCValAssign::Full:
        break;
      case CCValAssign::ZExt:
        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
        break;
      case CCValAssign::SExt:
        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
        break;
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));

      continue;
    }

    // Custom handling is used for GPR initializations for vararg float
    // arguments.
    assert(isVarArg && VA.getValVT().isFloatingPoint() &&
           VA.getLocVT().isInteger() &&
           "Unexpected custom register handling for calling convention.");

    SDValue ArgAsInt =
        DAG.getBitcast(MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg);

    if (Arg.getValueType().getStoreSize() == VA.getLocVT().getStoreSize())
      // f32 in 32-bit GPR
      // f64 in 64-bit GPR
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
    else if (Arg.getValueType().getSizeInBits() <
             VA.getLocVT().getSizeInBits())
      // f32 in 64-bit GPR.
      RegsToPass.push_back(std::make_pair(
          VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, VA.getLocVT())));
    else {
      // f64 in two 32-bit GPRs
      // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
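      // (The first GPR receives the most significant word, obtained below by
      // a 64-bit shift right of 32 and truncation; the second GPR receives
      // the least significant word by truncation alone.)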
      assert(Arg.getValueType() == MVT::f64 && isVarArg && !IsPPC64 &&
             "Unexpected custom register for argument!");
      CCValAssign &GPR1 = VA;
      SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
                                     DAG.getConstant(32, dl, MVT::i8));
      RegsToPass.push_back(std::make_pair(
          GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
      assert(I != E && "A second custom GPR is expected!");
      CCValAssign &GPR2 = ArgLocs[I++];
      assert(GPR2.isRegLoc() && GPR2.getValNo() == GPR1.getValNo() &&
             GPR2.needsCustom() && "A second custom GPR is expected!");
      RegsToPass.push_back(std::make_pair(
          GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
    }
  }

  // For indirect calls, we need to save the TOC base to the stack for
  // restoration after the call.
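  // (On AIX the save slot is the reserved TOC field in the caller's linkage
  // area; this is believed to be 20(R1) in 32-bit mode and 40(R1) in 64-bit
  // mode.)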
  if (!isTailCall && !isPatchPoint &&
      !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) {
    const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
    const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
    const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
    const unsigned TOCSaveOffset =
        Subtarget.getFrameLowering()->getTOCSaveOffset();

    setUsesTOCBasePtr(DAG);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT);
    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
    SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT);
    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    Chain = DAG.getStore(
        Val.getValue(1), dl, Val, AddPtr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (auto Reg : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag);
    InFlag = Chain.getValue(1);
  }

  const int SPDiff = 0;
  return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
                    /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass,
                    InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins,
                    InVals, CS);
}

bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(
      Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
                ? RetCC_PPC_Cold
                : RetCC_PPC);
}

SDValue
PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs,
                       (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
                           ? RetCC_PPC_Cold
                           : RetCC_PPC);

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // Copy the result values into the output registers.
  for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[RealResIdx];

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    }
    if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
      bool isLittleEndian = Subtarget.isLittleEndian();
      // Legalize ret f64 -> ret 2 x i32.
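      // The two EXTRACT_SPE nodes below pull out the two 32-bit halves of
      // the f64; which half goes into which GPR depends on endianness.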
      SDValue SVal =
          DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
                      DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
                         DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
      Flag = Chain.getValue(1);
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {

      if (PPC::G8RCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (PPC::F8RCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else if (PPC::CRRCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i1));
      else if (PPC::VRRCRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::Other));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}

SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Get the correct type for integers.
  EVT IntVT = Op.getValueType();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNAREAOFFSET node.
  SDValue Ops[2] = {Chain, FPSIdx};
  SDVTList VTs = DAG.getVTList(IntVT);
  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}

SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Construct the stack pointer operand.
  bool isPPC64 = Subtarget.isPPC64();
  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(SP, PtrVT);

  // Get the operands for the STACKRESTORE.
  SDValue Chain = Op.getOperand(0);
  SDValue SaveSP = Op.getOperand(1);

  // Load the old link SP.
  SDValue LoadLinkSP =
      DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());

  // Restore the stack pointer.
  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);

  // Store the old link SP.
  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}

SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current return address save index.  The users of this index
  // are primarily the lowering of RETURNADDR.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return address save index hasn't been defined yet.
  if (!RASI) {
    // Find out the fixed offset of the return address save area.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    // Allocate the frame index for the return address save area.
    RASI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, LROffset, false);
    // Save the result.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(RASI, PtrVT);
}

SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current frame pointer save index.  The users of this index will
  // be primarily DYNALLOC instructions.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int FPSI = FI->getFramePointerSaveIndex();

  // If the frame pointer save index hasn't been defined yet.
  if (!FPSI) {
    // Find out the fixed offset of the frame pointer save area.
    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
    // Allocate the frame index for the frame pointer save area.
    FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
    // Save the result.
    FI->setFramePointerSaveIndex(FPSI);
  }
  return DAG.getFrameIndex(FPSI, PtrVT);
}

SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                   SelectionDAG &DAG) const {
  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Negate the size.
  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
                                DAG.getConstant(0, dl, PtrVT), Size);
  // Construct a node for the frame pointer save index.
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNALLOC node.
  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
}

SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false);
  return DAG.getFrameIndex(FI, PtrVT);
}

SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorLoad(Op, DAG);

  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering only for i1 loads");

  // First, load 8 bits into 32 bits, then truncate to 1 bit.

  SDLoc dl(Op);
  LoadSDNode *LD = cast<LoadSDNode>(Op);

  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  MachineMemOperand *MMO = LD->getMemOperand();

  SDValue NewLD =
      DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,
                     BasePtr, MVT::i8, MMO);
  SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);

  SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
  return DAG.getMergeValues(Ops, dl);
}

SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getOperand(1).getValueType().isVector())
    return LowerVectorStore(Op, DAG);

  assert(Op.getOperand(1).getValueType() == MVT::i1 &&
         "Custom lowering only for i1 stores");

  // First, zero extend to 32 bits, then use a truncating store to 8 bits.

  SDLoc dl(Op);
  StoreSDNode *ST = cast<StoreSDNode>(Op);

  SDValue Chain = ST->getChain();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Value = ST->getValue();
  MachineMemOperand *MMO = ST->getMemOperand();

  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),
                      Value);
  return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
}

// FIXME: Remove this once the ANDI glue bug is fixed:
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering only for i1 results");

  SDLoc DL(Op);
  return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0));
}

SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
                                               SelectionDAG &DAG) const {

  // Implements a vector truncate that fits in a vector register as a shuffle.
  // We want to legalize vector truncates down to where the source fits in
  // a vector register (and target is therefore smaller than vector register
  // size).  At that point legalization will try to custom lower the sub-legal
  // result and get here - where we can contain the truncate as a single target
  // operation.

  // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
  //   <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
  //
  // We will implement it for big-endian ordering as this (where x denotes
  // undefined):
  //   < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
  //   < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
  //
  // The same operation in little-endian ordering will be:
  //   <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
  //   <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
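  //
  // Likewise, truncating v4i32 to v4i16 on big-endian selects halfword
  // elements 1, 3, 5 and 7 from the 128-bit source, so ShuffV below starts
  // as {1, 3, 5, 7} before being padded with undef indices.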

  assert(Op.getValueType().isVector() && "Vector type expected.");

  SDLoc DL(Op);
  SDValue N1 = Op.getOperand(0);
  unsigned SrcSize = N1.getValueType().getSizeInBits();
  assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
  SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);

  EVT TrgVT = Op.getValueType();
  unsigned TrgNumElts = TrgVT.getVectorNumElements();
  EVT EltVT = TrgVT.getVectorElementType();
  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);

  // First list the elements we want to keep.
  unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
  SmallVector<int, 16> ShuffV;
  if (Subtarget.isLittleEndian())
    for (unsigned i = 0; i < TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult);
  else
    for (unsigned i = 1; i <= TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult - 1);

  // Populate the remaining elements with undefs; indices >= WideNumElts
  // refer to the second shuffle operand, which is all-undef below.
  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
    ShuffV.push_back(i + WideNumElts);

  SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
  return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
}

/// LowerSELECT_CC - Lower floating-point select_cc's into an fsel instruction
/// when possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  // Not FP? Not a fsel.
  if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
      !Op.getOperand(2).getValueType().isFloatingPoint())
    return Op;

  bool HasNoInfs = DAG.getTarget().Options.NoInfsFPMath;
  bool HasNoNaNs = DAG.getTarget().Options.NoNaNsFPMath;
  // We might be able to do better than this under some circumstances, but in
  // general, fsel-based lowering of select is a finite-math-only optimization.
  // For more information, see section F.3 of the 2.06 ISA specification.
  // With ISA 3.0, we have xsmaxcdp/xsmincdp which are OK to emit even in the
  // presence of infinities.
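  // (PPCISD::FSEL(Cmp, TV, FV) yields TV when Cmp >= 0.0 and FV otherwise,
  // which is why the cases below describe fsel as "natively setge".)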
  if (!Subtarget.hasP9Vector() && (!HasNoInfs || !HasNoNaNs))
    return Op;
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  EVT ResVT = Op.getValueType();
  EVT CmpVT = Op.getOperand(0).getValueType();
  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
  SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
  SDLoc dl(Op);

  if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
    switch (CC) {
    default:
      // Not a min/max but with finite math, we may still be able to use fsel.
      if (HasNoInfs && HasNoNaNs)
        break;
      return Op;
    case ISD::SETOGT:
    case ISD::SETGT:
      return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS);
    case ISD::SETOLT:
    case ISD::SETLT:
      return DAG.getNode(PPCISD::XSMINCDP, dl, Op.getValueType(), LHS, RHS);
    }
  }

  // TODO: Propagate flags from the select rather than global settings.
  SDNodeFlags Flags;
  Flags.setNoInfs(true);
  Flags.setNoNaNs(true);

  // If the RHS of the comparison is a 0.0, we don't need to do the
  // subtraction at all.
  SDValue Sel1;
  if (isFloatingPointZero(RHS))
    switch (CC) {
    default: break;       // SETUO etc aren't handled by fsel.
    case ISD::SETNE:
      std::swap(TV, FV);
      LLVM_FALLTHROUGH;
    case ISD::SETEQ:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
      Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
      if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
        Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
    case ISD::SETULT:
    case ISD::SETLT:
      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
      LLVM_FALLTHROUGH;
    case ISD::SETOGE:
    case ISD::SETGE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
      return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
    case ISD::SETUGT:
    case ISD::SETGT:
      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
      LLVM_FALLTHROUGH;
    case ISD::SETOLE:
    case ISD::SETLE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
    }

  SDValue Cmp;
  switch (CC) {
  default: break;       // SETUO etc aren't handled by fsel.
  case ISD::SETNE:
    std::swap(TV, FV);
    LLVM_FALLTHROUGH;
  case ISD::SETEQ:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
    if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT,
                       DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
  case ISD::SETULT:
  case ISD::SETLT:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
  case ISD::SETOGE:
  case ISD::SETGE:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
  case ISD::SETUGT:
  case ISD::SETGT:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
  case ISD::SETOLE:
  case ISD::SETLE:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
  }
  return Op;
}

void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
                                               SelectionDAG &DAG,
                                               const SDLoc &dl) const {
  assert(Op.getOperand(0).getValueType().isFloatingPoint());
  SDValue Src = Op.getOperand(0);
  if (Src.getValueType() == MVT::f32)
    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);

  SDValue Tmp;
  switch (Op.getSimpleValueType().SimpleTy) {
  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
  case MVT::i32:
    Tmp = DAG.getNode(
        Op.getOpcode() == ISD::FP_TO_SINT
            ? PPCISD::FCTIWZ
            : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
        dl, MVT::f64, Src);
    break;
  case MVT::i64:
    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
           "i64 FP_TO_UINT is supported only with FPCVT");
    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
                                                        PPCISD::FCTIDUZ,
                      dl, MVT::f64, Src);
    break;
  }

  // Convert the FP value to an int value through memory.
  bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
                  (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

  // Emit a store to the stack slot.
  SDValue Chain;
  if (i32Stack) {
    MachineFunction &MF = DAG.getMachineFunction();
    MachineMemOperand *MMO =
        MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
    SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
                                    DAG.getVTList(MVT::Other), Ops, MVT::i32,
                                    MMO);
  } else
    Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI);

  // Result is a load from the stack slot.  If loading 4 bytes, make sure to
  // add in a bias on big endian.
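  // (The 32-bit result occupies the low word of the f64 stack slot, which on
  // big-endian systems sits at byte offset 4.)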
  if (Op.getValueType() == MVT::i32 && !i32Stack) {
    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
                        DAG.getConstant(4, dl, FIPtr.getValueType()));
    MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
  }

  RLI.Chain = Chain;
  RLI.Ptr = FIPtr;
  RLI.MPI = MPI;
}

/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
                                                    SelectionDAG &DAG,
                                                    const SDLoc &dl) const {
  assert(Op.getOperand(0).getValueType().isFloatingPoint());
  SDValue Src = Op.getOperand(0);

  if (Src.getValueType() == MVT::f32)
    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);

  SDValue Tmp;
  switch (Op.getSimpleValueType().SimpleTy) {
  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
  case MVT::i32:
    Tmp = DAG.getNode(
        Op.getOpcode() == ISD::FP_TO_SINT
            ? PPCISD::FCTIWZ
            : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ),
        dl, MVT::f64, Src);
    Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp);
    break;
  case MVT::i64:
    assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
           "i64 FP_TO_UINT is supported only with FPCVT");
    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
                                                        PPCISD::FCTIDUZ,
                      dl, MVT::f64, Src);
    Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp);
    break;
  }
  return Tmp;
}

SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
                                          const SDLoc &dl) const {

  // FP to INT conversions are legal for f128.
  if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128))
    return Op;

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  if (Op.getOperand(0).getValueType() == MVT::ppcf128) {
    if (Op.getValueType() == MVT::i32) {
      if (Op.getOpcode() == ISD::FP_TO_SINT) {
        SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
                                 MVT::f64, Op.getOperand(0),
                                 DAG.getIntPtrConstant(0, dl));
        SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
                                 MVT::f64, Op.getOperand(0),
                                 DAG.getIntPtrConstant(1, dl));

        // Add the two halves of the long double in round-to-zero mode.
        SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);

        // Now use a smaller FP_TO_SINT.
        return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
      }
      if (Op.getOpcode() == ISD::FP_TO_UINT) {
        const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
        APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
        SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
        // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
        // FIXME: generated code sucks.
        // TODO: Are there fast-math-flags to propagate to this FSUB?
        SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128,
                                   Op.getOperand(0), Tmp);
        True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
        True = DAG.getNode(ISD::ADD, dl, MVT::i32, True,
                           DAG.getConstant(0x80000000, dl, MVT::i32));
        SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
                                    Op.getOperand(0));
        return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False,
                               ISD::SETGE);
      }
    }

    return SDValue();
  }

  if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
    return LowerFP_TO_INTDirectMove(Op, DAG, dl);

  ReuseLoadInfo RLI;
  LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);

  return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
                     RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
}

// We're trying to insert a regular store, S, and then a load, L.  If the
// incoming value, O, is a load, we might just be able to have our load use the
// address used by O.  However, we don't know if anything else will store to
// that address before we can load from it.  To prevent this situation, we need
// to insert our load, L, into the chain as a peer of O.  To do this, we give L
// the same chain operand as O, we create a token factor from the chain results
// of O and L, and we replace all uses of O's chain result with that token
// factor (see spliceIntoChain below for this last part).
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
                                            ReuseLoadInfo &RLI,
                                            SelectionDAG &DAG,
                                            ISD::LoadExtType ET) const {
  SDLoc dl(Op);
  if (ET == ISD::NON_EXTLOAD &&
      (Op.getOpcode() == ISD::FP_TO_UINT ||
       Op.getOpcode() == ISD::FP_TO_SINT) &&
      isOperationLegalOrCustom(Op.getOpcode(),
                               Op.getOperand(0).getValueType())) {

    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
    return true;
  }

  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
      LD->isNonTemporal())
    return false;
  if (LD->getMemoryVT() != MemVT)
    return false;

  RLI.Ptr = LD->getBasePtr();
  if (LD->isIndexed() && !LD->getOffset().isUndef()) {
    assert(LD->getAddressingMode() == ISD::PRE_INC &&
           "Non-pre-inc AM on PPC?");
    RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
                          LD->getOffset());
  }

  RLI.Chain = LD->getChain();
  RLI.MPI = LD->getPointerInfo();
  RLI.IsDereferenceable = LD->isDereferenceable();
  RLI.IsInvariant = LD->isInvariant();
  RLI.Alignment = LD->getAlignment();
  RLI.AAInfo = LD->getAAInfo();
  RLI.Ranges = LD->getRanges();

  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
  return true;
}

// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
                                        SDValue NewResChain,
                                        SelectionDAG &DAG) const {
  if (!ResChain)
    return;

  SDLoc dl(NewResChain);

  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                           NewResChain, DAG.getUNDEF(MVT::Other));
  assert(TF.getNode() != NewResChain.getNode() &&
         "A new TF really is required here");

  DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
  DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}

/// Analyze the profitability of a direct move: prefer a plain float load
/// over an integer load plus direct move when the loaded integer value has
/// no uses other than conversions to floating point.
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
  SDNode *Origin = Op.getOperand(0).getNode();
  if (Origin->getOpcode() != ISD::LOAD)
    return true;

  // If there is no LXSIBZX/LXSIHZX, like Power8,
  // prefer direct move if the memory size is 1 or 2 bytes.
  MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
  if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
    return true;

  for (SDNode::use_iterator UI = Origin->use_begin(),
                            UE = Origin->use_end();
       UI != UE; ++UI) {

    // Only look at the users of the loaded value.
    if (UI.getUse().get().getResNo() != 0)
      continue;

    if (UI->getOpcode() != ISD::SINT_TO_FP &&
        UI->getOpcode() != ISD::UINT_TO_FP)
      return true;
  }

  return false;
}

/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
                                                    SelectionDAG &DAG,
                                                    const SDLoc &dl) const {
  assert((Op.getValueType() == MVT::f32 ||
          Op.getValueType() == MVT::f64) &&
         "Invalid floating point type as target of conversion");
  assert(Subtarget.hasFPCVT() &&
         "Int to FP conversions with direct moves require FPCVT");
  SDValue FP;
  SDValue Src = Op.getOperand(0);
  bool SinglePrec = Op.getValueType() == MVT::f32;
  bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
  bool Signed = Op.getOpcode() == ISD::SINT_TO_FP;
  unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) :
                             (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU);

  if (WordInt) {
    FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ,
                     dl, MVT::f64, Src);
    FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
  }
  else {
    FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src);
    FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP);
  }

  return FP;
}

static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {

  EVT VecVT = Vec.getValueType();
  assert(VecVT.isVector() && "Expected a vector type.");
  assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");

  EVT EltVT = VecVT.getVectorElementType();
  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);

  unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
  SmallVector<SDValue, 16> Ops(NumConcat);
  Ops[0] = Vec;
  SDValue UndefVec = DAG.getUNDEF(VecVT);
  for (unsigned i = 1; i < NumConcat; ++i)
    Ops[i] = UndefVec;

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);
}
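// For example, widening a v2i16 source concatenates it with three undef
// v2i16 vectors to produce a full 128-bit v8i16.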

SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
                                                const SDLoc &dl) const {

  unsigned Opc = Op.getOpcode();
  assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
         "Unexpected conversion type");
  assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
         "Supports conversions to v2f64/v4f32 only.");

  bool SignedConv = Opc == ISD::SINT_TO_FP;
  bool FourEltRes = Op.getValueType() == MVT::v4f32;

  SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
  EVT WideVT = Wide.getValueType();
  unsigned WideNumElts = WideVT.getVectorNumElements();
  MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;

  SmallVector<int, 16> ShuffV;
  for (unsigned i = 0; i < WideNumElts; ++i)
    ShuffV.push_back(i + WideNumElts);

  int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
  int SaveElts = FourEltRes ? 4 : 2;
  if (Subtarget.isLittleEndian())
    for (int i = 0; i < SaveElts; i++)
      ShuffV[i * Stride] = i;
  else
    for (int i = 1; i <= SaveElts; i++)
      ShuffV[i * Stride - 1] = i - 1;
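  // For example, an unsigned v4i8 -> v4f32 conversion places each source
  // byte in one byte of a 32-bit lane (the lowest byte on LE, the highest
  // on BE) with zeros elsewhere, so the bitcast to v4i32 below is a free
  // zero-extension.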

  SDValue ShuffleSrc2 =
      SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
  SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
  unsigned ExtendOp =
      SignedConv ? (unsigned)PPCISD::SExtVElems : (unsigned)ISD::BITCAST;

  SDValue Extend;
  if (!Subtarget.hasP9Altivec() && SignedConv) {
    Arrange = DAG.getBitcast(IntermediateVT, Arrange);
    Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
                         DAG.getValueType(Op.getOperand(0).getValueType()));
  } else
    Extend = DAG.getNode(ExtendOp, dl, IntermediateVT, Arrange);

  return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
}

SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);

  EVT InVT = Op.getOperand(0).getValueType();
  EVT OutVT = Op.getValueType();
  if (OutVT.isVector() && OutVT.isFloatingPoint() &&
      isOperationCustom(Op.getOpcode(), InVT))
    return LowerINT_TO_FPVector(Op, DAG, dl);

  // Conversions to f128 are legal.
  if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
    return Op;

  if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
    if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64)
      return SDValue();

    SDValue Value = Op.getOperand(0);
    // The values are now known to be -1 (false) or 1 (true). To convert this
    // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
    // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
    Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

    SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

    Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

    if (Op.getValueType() != MVT::v4f64)
      Value = DAG.getNode(ISD::FP_ROUND, dl,
                          Op.getValueType(), Value,
                          DAG.getIntPtrConstant(1, dl));
    return Value;
  }

  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
    return SDValue();

  if (Op.getOperand(0).getValueType() == MVT::i1)
    return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
                       DAG.getConstantFP(1.0, dl, Op.getValueType()),
                       DAG.getConstantFP(0.0, dl, Op.getValueType()));

  // If we have direct moves, we can do all the conversion, skip the store/load
  // however, without FPCVT we can't do most conversions.
  if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
      Subtarget.isPPC64() && Subtarget.hasFPCVT())
    return LowerINT_TO_FPDirectMove(Op, DAG, dl);

  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
                                                            : PPCISD::FCFIDS)
                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
                                                            : PPCISD::FCFID);
  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                  ? MVT::f32
                  : MVT::f64;

  if (Op.getOperand(0).getValueType() == MVT::i64) {
    SDValue SINT = Op.getOperand(0);
    // When converting to single-precision, we actually need to convert
    // to double-precision first and then round to single-precision.
    // To avoid double-rounding effects during that operation, we have
    // to prepare the input operand.  Bits that might be truncated when
    // converting to double-precision are replaced by a bit that won't
    // be lost at this stage, but is below the single-precision rounding
    // position.
    //
    // However, if -enable-unsafe-fp-math is in effect, accept double
    // rounding to avoid the extra overhead.
    if (Op.getValueType() == MVT::f32 &&
        !Subtarget.hasFPCVT() &&
        !DAG.getTarget().Options.UnsafeFPMath) {

      // Twiddle input to make sure the low 11 bits are zero.  (If this
      // is the case, we are guaranteed the value will fit into the 53 bit
      // mantissa of an IEEE double-precision value without rounding.)
      // If any of those low 11 bits were not zero originally, make sure
      // bit 12 (value 2048) is set instead, so that the final rounding
      // to single-precision gets the correct result.
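      // For example, if SINT ends in ...001, then (SINT & 2047) + 2047 =
      // 0x800, and OR-ing that into SINT before masking with -2048 clears
      // the low 11 bits while leaving the 2048 bit set as a sticky round
      // bit.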
      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
                                  SINT, DAG.getConstant(2047, dl, MVT::i64));
      Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
                          Round, DAG.getConstant(2047, dl, MVT::i64));
      Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
      Round = DAG.getNode(ISD::AND, dl, MVT::i64,
                          Round, DAG.getConstant(-2048, dl, MVT::i64));

      // However, we cannot use that value unconditionally: if the magnitude
      // of the input value is small, the bit-twiddling we did above might
      // end up visibly changing the output.  Fortunately, in that case, we
      // don't need to twiddle bits since the original input will convert
      // exactly to double-precision floating-point already.  Therefore,
      // construct a conditional to use the original value if the top 11
      // bits are all sign-bit copies, and use the rounded value computed
      // above otherwise.
      SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
                                 SINT, DAG.getConstant(53, dl, MVT::i32));
      Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
                         Cond, DAG.getConstant(1, dl, MVT::i64));
      Cond = DAG.getSetCC(dl, MVT::i32,
                          Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);

      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
    }

    ReuseLoadInfo RLI;
    SDValue Bits;

    MachineFunction &MF = DAG.getMachineFunction();
    if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
      Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
                         RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (Subtarget.hasLFIWAX() &&
               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
      MachineMemOperand *MMO =
          MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                  RLI.Alignment, RLI.AAInfo, RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
                                     DAG.getVTList(MVT::f64, MVT::Other),
                                     Ops, MVT::i32, MMO);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (Subtarget.hasFPCVT() &&
               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
      MachineMemOperand *MMO =
          MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                  RLI.Alignment, RLI.AAInfo, RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
                                     DAG.getVTList(MVT::f64, MVT::Other),
                                     Ops, MVT::i32, MMO);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (((Subtarget.hasLFIWAX() &&
                 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
                (Subtarget.hasFPCVT() &&
                 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
               SINT.getOperand(0).getValueType() == MVT::i32) {
      MachineFrameInfo &MFI = MF.getFrameInfo();
      EVT PtrVT = getPointerTy(DAG.getDataLayout());

      int FrameIdx = MFI.CreateStackObject(4, 4, false);
      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

      SDValue Store =
          DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(), FrameIdx));

      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
             "Expected an i32 store");

      RLI.Ptr = FIdx;
      RLI.Chain = Store;
      RLI.MPI =
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
      RLI.Alignment = 4;

      MachineMemOperand *MMO =
          MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                  RLI.Alignment, RLI.AAInfo, RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
                                         PPCISD::LFIWZX : PPCISD::LFIWAX,
                                     dl, DAG.getVTList(MVT::f64, MVT::Other),
                                     Ops, MVT::i32, MMO);
    } else
      Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);

    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);

    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
      FP = DAG.getNode(ISD::FP_ROUND, dl,
                       MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
    return FP;
  }

  assert(Op.getOperand(0).getValueType() == MVT::i32 &&
         "Unhandled INT_TO_FP type in custom expander!");
  // Since we only generate this in 64-bit mode, we can take advantage of
  // 64-bit registers.  In particular, sign extend the input value into the
  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
  // then lfd it and fcfid it.
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  SDValue Ld;
  if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
    ReuseLoadInfo RLI;
    bool ReusingLoad;
    if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
                                            DAG))) {
      int FrameIdx = MFI.CreateStackObject(4, 4, false);
      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

      SDValue Store =
          DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(), FrameIdx));

      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
             "Expected an i32 store");

      RLI.Ptr = FIdx;
      RLI.Chain = Store;
      RLI.MPI =
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
      RLI.Alignment = 4;
    }

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
    Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
                                     PPCISD::LFIWZX : PPCISD::LFIWAX,
                                 dl, DAG.getVTList(MVT::f64, MVT::Other),
                                 Ops, MVT::i32, MMO);
    if (ReusingLoad)
      spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
  } else {
    assert(Subtarget.isPPC64() &&
           "i32->FP without LFIWAX supported only on PPC64");

    int FrameIdx = MFI.CreateStackObject(8, 8, false);
    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

    SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
                                Op.getOperand(0));

    // STD the extended value into the stack slot.
    SDValue Store = DAG.getStore(
        DAG.getEntryNode(), dl, Ext64, FIdx,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));

    // Load the value as a double.
    Ld = DAG.getLoad(
        MVT::f64, dl, Store, FIdx,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
  }

  // FCFID it and return it.
  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
  if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
                     DAG.getIntPtrConstant(0, dl));
  return FP;
}

SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  /*
   The rounding mode is in bits 30:31 of FPSCR, and has the following
   settings:
     00 Round to nearest
     01 Round to 0
     10 Round to +inf
     11 Round to -inf

   FLT_ROUNDS, on the other hand, expects the following:
     -1 Undefined
      0 Round to 0
      1 Round to nearest
      2 Round to +inf
      3 Round to -inf

   To perform the conversion, we do:
     ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
  */
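  // For example, FPSCR RN = 0b01 (round toward zero) gives
  //   (1 & 0x3) ^ ((~1 & 0x3) >> 1) = 1 ^ (2 >> 1) = 0,
  // which is FLT_ROUNDS' encoding for "round to 0".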

  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = Op.getValueType();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Save FP Control Word to register
  EVT NodeTys[] = {
    MVT::f64,    // return register
    MVT::Glue    // unused in this context
  };
  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);

  // Save FP register to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
                               MachinePointerInfo());

  // Load FP Control Word from low 32 bits of stack slot.
  SDValue Four = DAG.getConstant(4, dl, PtrVT);
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::AND, dl, MVT::i32,
                CWD, DAG.getConstant(3, dl, MVT::i32));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i32,
                DAG.getNode(ISD::AND, dl, MVT::i32,
                            DAG.getNode(ISD::XOR, dl, MVT::i32,
                                        CWD, DAG.getConstant(3, dl, MVT::i32)),
                            DAG.getConstant(3, dl, MVT::i32)),
                DAG.getConstant(1, dl, MVT::i32));

  SDValue RetVal =
    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}

SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  unsigned BitWidth = VT.getSizeInBits();
  SDLoc dl(Op);
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SHL!");

  // Expand into a bunch of logical ops.  Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
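  // For example, with BitWidth = 32 and Amt = 40, oversized PPC shifts
  // yield zero, so Tmp2 = Hi << 40 = 0 and Tmp3 = Lo >> (32 - 40) = 0,
  // while Tmp6 = Lo << (40 - 32) = Lo << 8.  Thus OutHi = Lo << 8 and
  // OutLo = Lo << 40 = 0, which is the correct 64-bit result.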
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}

SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRL!");

  // Expand into a bunch of logical ops.  Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}

SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRA!");

  // Expand into a bunch of logical ops, followed by a select_cc.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
  SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT),
                                  Tmp4, Tmp6, ISD::SETLE);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}

//===----------------------------------------------------------------------===//
// Vector related lowering.
//

/// BuildSplatI - Build a canonical splati of Val with an element size of
/// SplatSize.  Cast the result to VT.
static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
                           SelectionDAG &DAG, const SDLoc &dl) {
  static const MVT VTys[] = { // canonical VT to use for each size.
    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
  };

  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];

  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
  if (Val == -1)
    SplatSize = 1;

  EVT CanonicalVT = VTys[SplatSize-1];

  // Build a canonical splat for this value.
  return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}
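// For example, BuildSplatI(-1, 4, MVT::v4i32, ...) canonicalizes to a v16i8
// splat of -1 (a single vspltisb -1) bitcast back to v4i32, since all-ones
// has the same bit pattern at every element size.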
|
|
|
|
/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
|
|
/// specified intrinsic ID.
|
|
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
|
|
const SDLoc &dl, EVT DestVT = MVT::Other) {
|
|
if (DestVT == MVT::Other) DestVT = Op.getValueType();
|
|
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
|
|
DAG.getConstant(IID, dl, MVT::i32), Op);
|
|
}
|
|
|
|
/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
|
|
/// specified intrinsic ID.
|
|
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
|
|
SelectionDAG &DAG, const SDLoc &dl,
|
|
EVT DestVT = MVT::Other) {
|
|
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
|
|
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
|
|
DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);
|
|
}
|
|
|
|
/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
|
|
/// specified intrinsic ID.
|
|
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
|
|
SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
|
|
EVT DestVT = MVT::Other) {
|
|
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
|
|
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
|
|
DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);
|
|
}
|
|
|
|
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
|
|
/// amount. The result has the specified value type.
|
|
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
|
|
SelectionDAG &DAG, const SDLoc &dl) {
|
|
// Force LHS/RHS to be the right type.
|
|
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
|
|
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
|
|
|
|
int Ops[16];
|
|
for (unsigned i = 0; i != 16; ++i)
|
|
Ops[i] = i + Amt;
|
|
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
|
|
return DAG.getNode(ISD::BITCAST, dl, VT, T);
|
|
}
|
|
|
|
/// Do we have an efficient pattern in a .td file for this node?
|
|
///
|
|
/// \param V - pointer to the BuildVectorSDNode being matched
|
|
/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
|
|
///
|
|
/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
|
|
/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
|
|
/// the opposite is true (expansion is beneficial) are:
|
|
/// - The node builds a vector out of integers that are not 32 or 64-bits
|
|
/// - The node builds a vector out of constants
|
|
/// - The node is a "load-and-splat"
|
|
/// In all other cases, we will choose to keep the BUILD_VECTOR.
|
|
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
|
|
bool HasDirectMove,
|
|
bool HasP8Vector) {
|
|
EVT VecVT = V->getValueType(0);
|
|
bool RightType = VecVT == MVT::v2f64 ||
|
|
(HasP8Vector && VecVT == MVT::v4f32) ||
|
|
(HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
|
|
if (!RightType)
|
|
return false;
|
|
|
|
bool IsSplat = true;
|
|
bool IsLoad = false;
|
|
SDValue Op0 = V->getOperand(0);
|
|
|
|
// This function is called in a block that confirms the node is not a constant
|
|
// splat. So a constant BUILD_VECTOR here means the vector is built out of
|
|
// different constants.
|
|
if (V->isConstant())
|
|
return false;
|
|
for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
|
|
if (V->getOperand(i).isUndef())
|
|
return false;
|
|
// We want to expand nodes that represent load-and-splat even if the
|
|
// loaded value is a floating point truncation or conversion to int.
|
|
if (V->getOperand(i).getOpcode() == ISD::LOAD ||
|
|
(V->getOperand(i).getOpcode() == ISD::FP_ROUND &&
|
|
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
|
|
(V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&
|
|
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
|
|
(V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&
|
|
V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))
|
|
IsLoad = true;
|
|
// If the operands are different or the input is not a load and has more
|
|
// uses than just this BV node, then it isn't a splat.
|
|
if (V->getOperand(i) != Op0 ||
|
|
(!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))
|
|
IsSplat = false;
|
|
}
|
|
return !(IsSplat && IsLoad);
|
|
}
|
|
|
|
// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
|
|
SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
|
|
|
|
SDLoc dl(Op);
|
|
SDValue Op0 = Op->getOperand(0);
|
|
|
|
if (!EnableQuadPrecision ||
|
|
(Op.getValueType() != MVT::f128 ) ||
|
|
(Op0.getOpcode() != ISD::BUILD_PAIR) ||
|
|
(Op0.getOperand(0).getValueType() != MVT::i64) ||
|
|
(Op0.getOperand(1).getValueType() != MVT::i64))
|
|
return SDValue();
|
|
|
|
return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
|
|
Op0.getOperand(1));
|
|
}
|
|
|
|
static const SDValue *getNormalLoadInput(const SDValue &Op) {
|
|
const SDValue *InputLoad = &Op;
|
|
if (InputLoad->getOpcode() == ISD::BITCAST)
|
|
InputLoad = &InputLoad->getOperand(0);
|
|
if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
|
|
InputLoad = &InputLoad->getOperand(0);
|
|
if (InputLoad->getOpcode() != ISD::LOAD)
|
|
return nullptr;
|
|
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
|
|
return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
|
|
}
|
|
|
|
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.  If we CAN select this case, and if it
// selects to a single instruction, return Op.  Otherwise, if we can codegen
// this case more efficiently than a constant pool load, lower it to the
// sequence of ops that should be used.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");

  if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) {
    // We first build an i32 vector, load it into a QPX register,
    // then convert it to a floating-point vector and compare it
    // to a zero vector to get the boolean result.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    int FrameIdx = MFI.CreateStackObject(16, 16, false);
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

    assert(BVN->getNumOperands() == 4 &&
           "BUILD_VECTOR for v4i1 does not have 4 operands");

    bool IsConst = true;
    for (unsigned i = 0; i < 4; ++i) {
      if (BVN->getOperand(i).isUndef()) continue;
      if (!isa<ConstantSDNode>(BVN->getOperand(i))) {
        IsConst = false;
        break;
      }
    }

    if (IsConst) {
      Constant *One =
          ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0);
      Constant *NegOne =
          ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0);

      Constant *CV[4];
      for (unsigned i = 0; i < 4; ++i) {
        if (BVN->getOperand(i).isUndef())
          CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext()));
        else if (isNullConstant(BVN->getOperand(i)))
          CV[i] = NegOne;
        else
          CV[i] = One;
      }

      Constant *CP = ConstantVector::get(CV);
      SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
                                          16 /* alignment */);

      SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
      SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
      return DAG.getMemIntrinsicNode(
          PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
    }

    SmallVector<SDValue, 4> Stores;
    for (unsigned i = 0; i < 4; ++i) {
      if (BVN->getOperand(i).isUndef()) continue;

      unsigned Offset = 4*i;
      SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
      Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

      unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize();
      if (StoreSize > 4) {
        Stores.push_back(
            DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx,
                              PtrInfo.getWithOffset(Offset), MVT::i32));
      } else {
        SDValue StoreValue = BVN->getOperand(i);
        if (StoreSize < 4)
          StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue);

        Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx,
                                      PtrInfo.getWithOffset(Offset)));
      }
    }

    SDValue StoreChain;
    if (!Stores.empty())
      StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
    else
      StoreChain = DAG.getEntryNode();

    // Now load from v4i32 into the QPX register; this will extend it to
    // v4i64 but not yet convert it to a floating point. Nevertheless, this
    // is typed as v4f64 because the QPX register integer states are not
    // explicitly represented.

    SDValue Ops[] = {StoreChain,
                     DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32),
                     FIdx};
    SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other});

    SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN,
        dl, VTs, Ops, MVT::v4i32, PtrInfo);
    LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
        DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32),
        LoadedVect);

    SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64);

    return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ);
  }

  // All other QPX vectors are handled by generic code.
  if (Subtarget.hasQPX())
    return SDValue();

  // Check if this is a splat of a constant value.
  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
      SplatBitSize > 32) {

    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
    // Handle load-and-splat patterns as we have instructions that will do this
    // in one go.
    if (InputLoad && DAG.isSplatValue(Op, true)) {
      LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);

      // We have handling for 4 and 8 byte elements.
      unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();

      // Checking for a single use of this load, we have to check for vector
      // width (128 bits) / ElementSize uses (since each operand of the
      // BUILD_VECTOR is a separate use of the value).
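      // For example, a 32-bit element type implies the load must have exactly
      // 128 / 32 == 4 uses, one per BUILD_VECTOR operand.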
      if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
          ((Subtarget.hasVSX() && ElementSize == 64) ||
           (Subtarget.hasP9Vector() && ElementSize == 32))) {
        SDValue Ops[] = {
          LD->getChain(),    // Chain
          LD->getBasePtr(),  // Ptr
          DAG.getValueType(Op.getValueType()) // VT
        };
        return
          DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
                                  DAG.getVTList(Op.getValueType(), MVT::Other),
                                  Ops, LD->getMemoryVT(), LD->getMemOperand());
      }
    }

    // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
    // lowered to VSX instructions under certain conditions.
    // Without VSX, there is no pattern more efficient than expanding the node.
    if (Subtarget.hasVSX() &&
        haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
                                        Subtarget.hasP8Vector()))
      return Op;
    return SDValue();
  }

  unsigned SplatBits = APSplatBits.getZExtValue();
  unsigned SplatUndef = APSplatUndef.getZExtValue();
  unsigned SplatSize = SplatBitSize / 8;

  // First, handle single instruction cases.

  // All zeros?
  if (SplatBits == 0) {
    // Canonicalize all zero vectors to be v4i32.
    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
      SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
    }
    return Op;
  }

  // We have XXSPLTIB for constant splats one byte wide.
  // FIXME: SplatBits is an unsigned int being cast to an int while passing it
  // as an argument to BuildSplatI. Given SplatSize == 1 it is okay here.
  if (Subtarget.hasP9Vector() && SplatSize == 1)
    return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl);

  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
  int32_t SextVal = (int32_t(SplatBits << (32-SplatBitSize)) >>
                     (32-SplatBitSize));
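  // For example, an 8-bit splat of 0xF0 gives 0xF0000000 after the left
  // shift, and the arithmetic right shift by 24 then yields SextVal == -16.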
  if (SextVal >= -16 && SextVal <= 15)
    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);

  // Two instruction sequences.

  // If this value is in the range [-32,30] and is even, use:
  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
  // If this value is in the range [17,31] and is odd, use:
  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
  // If this value is in the range [-31,-17] and is odd, use:
  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
  // Note the last two are three-instruction sequences.
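  // For example, a splat of 30 becomes VSPLTI(15) + VSPLTI(15), and a splat
  // of 27 becomes VSPLTI(11) - VSPLTI(-16).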
  if (SextVal >= -32 && SextVal <= 31) {
    // To avoid having these optimizations undone by constant folding,
    // we convert to a pseudo that will be expanded later into one of
    // the above forms.
    SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
    SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
    if (VT == Op.getValueType())
      return RetVal;
    else
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
  }

  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
  // for fneg/fabs.
  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
    // Make -1 and vspltisw -1:
    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);

    // Make the VSLW intrinsic, computing 0x8000_0000.
    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
                                   OnesV, DAG, dl);

    // xor by OnesV to invert it.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
  }

  // Check to see if this is a wide variety of vsplti*, binop self cases.
  static const signed char SplatCsts[] = {
    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
  };

  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
    // cases which are ambiguous (e.g. formation of 0x8000_0000).
    int i = SplatCsts[idx];

    // Figure out what shift amount will be used by altivec if shifted by i in
    // this splat size.
    unsigned TypeShiftAmt = i & (SplatBitSize-1);

    // vsplti + shl self.
    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
        Intrinsic::ppc_altivec_vslw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + srl self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
        Intrinsic::ppc_altivec_vsrw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + sra self (note the arithmetic shift, unlike the srl above).
    if (SextVal == (int)((int)i >> TypeShiftAmt)) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
        Intrinsic::ppc_altivec_vsraw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + rol self.
    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
        Intrinsic::ppc_altivec_vrlw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // t = vsplti c, result = vsldoi t, t, 1
    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 2
    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 3
    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
  }

  return SDValue();
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
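  // Each PFEntry packs a cost in bits [31:30] (consulted by the caller), an
  // opcode in bits [29:26], and two 13-bit operand IDs in bits [25:13] and
  // [12:0] that index back into PerfectShuffleTable for the operands.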

  enum {
    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VMRGHW,
    OP_VMRGLW,
    OP_VSPLTISW0,
    OP_VSPLTISW1,
    OP_VSPLTISW2,
    OP_VSPLTISW3,
    OP_VSLDOI4,
    OP_VSLDOI8,
    OP_VSLDOI12
  };

  if (OpNum == OP_COPY) {
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);

  int ShufIdxs[16];
  switch (OpNum) {
  default: llvm_unreachable("Unknown i32 permute!");
  case OP_VMRGHW:
    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
    break;
  case OP_VMRGLW:
    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
    break;
  case OP_VSPLTISW0:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+0;
    break;
  case OP_VSPLTISW1:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+4;
    break;
  case OP_VSPLTISW2:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+8;
    break;
  case OP_VSPLTISW3:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+12;
    break;
  case OP_VSLDOI4:
    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI8:
    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI12:
    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
  }
  EVT VT = OpLHS.getValueType();
  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
  return DAG.getNode(ISD::BITCAST, dl, VT, T);
}

/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned BytesInVector = 16;
  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the byte we want at element 7.
  unsigned LittleEndianShifts[] = {8, 7, 6, 5, 4, 3, 2, 1,
                                   0, 15, 14, 13, 12, 11, 10, 9};
  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
                                1, 2, 3, 4, 5, 6, 7, 8};

  ArrayRef<int> Mask = N->getMask();
  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa.
  // Possible permutations inserting an element from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   ...
  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
  // Inserting from V1 into V2 will be similar, except mask range will be
  // [16,31].

  bool FoundCandidate = false;
  // If both vector operands for the shuffle are the same vector, the mask
  // will contain only elements from the first one and the second one will be
  // undef.
  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
  // Go through the mask of bytes to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < BytesInVector; ++i) {
    unsigned CurrentElement = Mask[i];
    // If 2nd operand is undefined, we should only look for element 7 in the
    // Mask.
    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
      continue;

    bool OtherElementsInOrder = true;
    // Examine the other elements in the Mask to see if they're in original
    // order.
    for (unsigned j = 0; j < BytesInVector; ++j) {
      if (j == i)
        continue;
      // If CurrentElement is from V1 [0,15], then we expect the rest of the
      // Mask to be from V2 [16,31] and vice versa.  Unless the 2nd operand is
      // undefined, in which case we always assume we're picking from the 1st
      // operand.
      int MaskOffset =
          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
        OtherElementsInOrder = false;
        break;
      }
    }
    // If other elements are in original order, we record the number of shifts
    // we need to get the element we want into element 7.  Also record which
    // byte in the vector we should insert into.
    if (OtherElementsInOrder) {
      // If 2nd operand is undefined, we assume no shifts and no swapping.
      if (V2.isUndef()) {
        ShiftElts = 0;
        Swap = false;
      } else {
        // Only need the last 4 bits for shifts because operands will be
        // swapped if CurrentElement is >= 2^4.
        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
                         : BigEndianShifts[CurrentElement & 0xF];
        Swap = CurrentElement < BytesInVector;
      }
      InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
      FoundCandidate = true;
      break;
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTB,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(V1, V2);
  if (V2.isUndef())
    V2 = V1;
  if (ShiftElts) {
    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
                              DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
  }
  return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
                     DAG.getConstant(InsertAtByte, dl, MVT::i32));
}

/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned NumHalfWords = 8;
  const unsigned BytesInVector = NumHalfWords * 2;
  // Check that the shuffle is on half-words.
  if (!isNByteElemShuffleMask(N, 2, 1))
    return SDValue();

  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the half-word we want at element 3.
  unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
  unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};

  uint32_t Mask = 0;
  uint32_t OriginalOrderLow = 0x1234567;
  uint32_t OriginalOrderHigh = 0x89ABCDEF;
  // Now we look at mask elements 0,2,4,6,8,10,12,14.  Pack the mask into a
  // 32-bit space, only need 4-bit nibbles per element.
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
  }
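  // For example, the identity half-word mask <0,1,2,3,4,5,6,7> packs to
  // 0x01234567 (i.e. OriginalOrderLow).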

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa.  Possible permutations inserting an element
  // from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7
  //   0, X, 2, 3, 4, 5, 6, 7
  //   0, 1, X, 3, 4, 5, 6, 7
  //   0, 1, 2, X, 4, 5, 6, 7
  //   0, 1, 2, 3, X, 5, 6, 7
  //   0, 1, 2, 3, 4, X, 6, 7
  //   0, 1, 2, 3, 4, 5, X, 7
  //   0, 1, 2, 3, 4, 5, 6, X
  // Inserting from V1 into V2 will be similar, except mask range will be
  // [8,15].

  bool FoundCandidate = false;
  // Go through the mask of half-words to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
    uint32_t MaskOtherElts = ~(0xF << MaskShift);
    uint32_t TargetOrder = 0x0;

    // If both vector operands for the shuffle are the same vector, the mask
    // will contain only elements from the first one and the second one will be
    // undef.
    if (V2.isUndef()) {
      ShiftElts = 0;
      unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
      TargetOrder = OriginalOrderLow;
      Swap = false;
      // Skip if this isn't the correct element or the mask of the other
      // elements doesn't match our expected order.
      if (MaskOneElt == VINSERTHSrcElem &&
          (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        FoundCandidate = true;
        break;
      }
    } else { // If both operands are defined.
      // Target order is [8,15] if the current mask is between [0,7].
      TargetOrder =
          (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
      // Skip if the mask of the other elements doesn't match our expected
      // order.
      if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        // We only need the last 3 bits for the number of shifts.
        ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
                         : BigEndianShifts[MaskOneElt & 0x7];
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        Swap = MaskOneElt < NumHalfWords;
        FoundCandidate = true;
        break;
      }
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTH,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(V1, V2);
  if (V2.isUndef())
    V2 = V1;
  SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  if (ShiftElts) {
    // Double ShiftElts because we're left shifting on v16i8 type.
    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
                              DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
  }
  SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
                            DAG.getConstant(InsertAtByte, dl, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}

/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
/// return the code it can be lowered into.  Worst case, it can always be
/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  EVT VT = Op.getValueType();
  bool isLittleEndian = Subtarget.isLittleEndian();

  unsigned ShiftElts, InsertAtByte;
  bool Swap = false;

  // If this is a load-and-splat, we can do that with a single instruction
  // in some cases. However if the load has multiple uses, we don't want to
  // combine it because that will just produce multiple loads.
  const SDValue *InputLoad = getNormalLoadInput(V1);
  if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
      (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
      InputLoad->hasOneUse()) {
    bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
    int SplatIdx =
        PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);

    LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
    // For 4-byte load-and-splat, we need Power9.
    if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
      uint64_t Offset = 0;
      if (IsFourByte)
        Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
      else
        Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
      SDValue BasePtr = LD->getBasePtr();
      if (Offset != 0)
        BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                              BasePtr, DAG.getIntPtrConstant(Offset, dl));
      SDValue Ops[] = {
        LD->getChain(),    // Chain
        BasePtr,           // BasePtr
        DAG.getValueType(Op.getValueType()) // VT
      };
      SDVTList VTL =
          DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
      SDValue LdSplt =
          DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
                                  Ops, LD->getMemoryVT(), LD->getMemOperand());
      if (LdSplt.getValueType() != SVOp->getValueType(0))
        LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
      return LdSplt;
    }
  }
  if (Subtarget.hasP9Vector() &&
      PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
                           isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
    if (ShiftElts) {
      SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
                                DAG.getConstant(ShiftElts, dl, MVT::i32));
      SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
                                DAG.getConstant(InsertAtByte, dl, MVT::i32));
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
    }
    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
  }

  if (Subtarget.hasP9Altivec()) {
    SDValue NewISDNode;
    if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
      return NewISDNode;

    if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
      return NewISDNode;
  }

  if (Subtarget.hasVSX() &&
      PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    SDValue Conv2 =
        DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);

    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
                              DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
  }

  if (Subtarget.hasVSX() &&
      PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
    SDValue Conv2 =
        DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);

    SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
                                 DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
  }

  if (Subtarget.hasP9Vector()) {
    if (PPC::isXXBRHShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
      SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
    } else if (PPC::isXXBRWShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
      SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
    } else if (PPC::isXXBRDShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
      SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
    } else if (PPC::isXXBRQShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
      SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
    }
  }

  if (Subtarget.hasVSX()) {
    if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
      int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);

      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
      SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
                                  DAG.getConstant(SplatIdx, dl, MVT::i32));
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
    }

    // Left shifts of 8 bytes are actually swaps.  Convert accordingly.
    if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
      SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
    }
  }

  if (Subtarget.hasQPX()) {
    if (VT.getVectorNumElements() != 4)
      return SDValue();

    if (V2.isUndef()) V2 = V1;

    int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp);
    if (AlignIdx != -1) {
      return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2,
                         DAG.getConstant(AlignIdx, dl, MVT::i32));
    } else if (SVOp->isSplat()) {
      int SplatIdx = SVOp->getSplatIndex();
      if (SplatIdx >= 4) {
        std::swap(V1, V2);
        SplatIdx -= 4;
      }

      return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1,
                         DAG.getConstant(SplatIdx, dl, MVT::i32));
    }

    // Lower this into a qvgpci/qvfperm pair.

    // Compute the qvgpci literal
    unsigned idx = 0;
    for (unsigned i = 0; i < 4; ++i) {
      int m = SVOp->getMaskElt(i);
      unsigned mm = m >= 0 ? (unsigned) m : i;
      idx |= mm << (3-i)*3;
    }
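    // Each mask element takes 3 bits, element 0 in the highest position; e.g.
    // the mask <2,0,1,3> packs to (2<<9) | (0<<6) | (1<<3) | 3 == 1035.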

    SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64,
                             DAG.getConstant(idx, dl, MVT::i32));
    return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3);
  }

  // Cases that are handled by instructions that take permute immediates
  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
  // selected by the instruction selector.
  if (V2.isUndef()) {
    if (PPC::isSplatShuffleMask(SVOp, 1) ||
        PPC::isSplatShuffleMask(SVOp, 2) ||
        PPC::isSplatShuffleMask(SVOp, 4) ||
        PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
        PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
        PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
        PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
        (Subtarget.hasP8Altivec() && (
         PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
         PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
         PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
      return Op;
    }
  }

  // Altivec has a variety of "shuffle immediates" that take two vector inputs
  // and produce a fixed permutation.  If any of these match, do not lower to
  // VPERM.
  unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
  if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
      PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
      PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
      PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
      (Subtarget.hasP8Altivec() && (
       PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
       PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
       PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
    return Op;

  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
  // perfect shuffle table to emit an optimal matching sequence.
  ArrayRef<int> PermMask = SVOp->getMask();

  unsigned PFIndexes[4];
  bool isFourElementShuffle = true;
  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
    unsigned EltNo = 8;   // Start out undef.
    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
      if (PermMask[i*4+j] < 0)
        continue;   // Undef, ignore it.

      unsigned ByteSource = PermMask[i*4+j];
      if ((ByteSource & 3) != j) {
        isFourElementShuffle = false;
        break;
      }

      if (EltNo == 8) {
        EltNo = ByteSource/4;
      } else if (EltNo != ByteSource/4) {
        isFourElementShuffle = false;
        break;
      }
    }
    PFIndexes[i] = EltNo;
  }

  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
  // perfect shuffle vector to determine if it is cost effective to do this as
  // discrete instructions, or whether we should use a vperm.
  // For now, we skip this for little endian until such time as we have a
  // little-endian perfect shuffle table.
  if (isFourElementShuffle && !isLittleEndian) {
    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
        PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];

    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    // Determining when to avoid vperm is tricky.  Many things affect the cost
    // of vperm, particularly how many times the perm mask needs to be
    // computed.  For example, if the perm mask can be hoisted out of a loop or
    // is already used (perhaps because there are multiple permutes with the
    // same shuffle mask?) the vperm has a cost of 1.  OTOH, hoisting the
    // permute mask out of the loop requires an extra register.
    //
    // As a compromise, we only emit discrete instructions if the shuffle can
    // be generated in 3 or fewer operations.  When we have loop information
    // available, if this block is within a loop, we should avoid using vperm
    // for 3-operation perms and use a constant pool load instead.
    if (Cost < 3)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
  // vector that will get spilled to the constant pool.
  if (V2.isUndef()) V2 = V1;

  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
  // that it is in input element units, not in bytes.  Convert now.

  // For little endian, the order of the input vectors is reversed, and
  // the permutation mask is complemented with respect to 31.  This is
  // necessary to produce proper semantics with the big-endian-biased vperm
  // instruction.
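  // For example, a mask entry that selects byte 0 becomes vperm index
  // 31 - 0 == 31 on little-endian targets, with V1/V2 also swapped below.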
  EVT EltVT = V1.getValueType().getVectorElementType();
  unsigned BytesPerElement = EltVT.getSizeInBits()/8;

  SmallVector<SDValue, 16> ResultMask;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];

    for (unsigned j = 0; j != BytesPerElement; ++j)
      if (isLittleEndian)
        ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j),
                                             dl, MVT::i32));
      else
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl,
                                             MVT::i32));
  }

  SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
  if (isLittleEndian)
    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
                       V2, V1, VPermMask);
  else
    return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
                       V1, V2, VPermMask);
}

/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
/// vector comparison.  If it is, return true and fill in CompareOpc/isDot
/// with information about the intrinsic.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
                                 bool &isDot, const PPCSubtarget &Subtarget) {
  unsigned IntrinsicID =
      cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
  CompareOpc = -1;
  isDot = false;
  switch (IntrinsicID) {
  default:
    return false;
  // Comparison predicates.
  case Intrinsic::ppc_altivec_vcmpbfp_p:
    CompareOpc = 966;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp_p:
    CompareOpc = 198;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequb_p:
    CompareOpc = 6;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequh_p:
    CompareOpc = 70;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequw_p:
    CompareOpc = 134;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequd_p:
    if (Subtarget.hasP8Altivec()) {
      CompareOpc = 199;
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpneb_p:
  case Intrinsic::ppc_altivec_vcmpneh_p:
  case Intrinsic::ppc_altivec_vcmpnew_p:
  case Intrinsic::ppc_altivec_vcmpnezb_p:
  case Intrinsic::ppc_altivec_vcmpnezh_p:
  case Intrinsic::ppc_altivec_vcmpnezw_p:
    if (Subtarget.hasP9Altivec()) {
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb_p:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh_p:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew_p:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb_p:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh_p:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw_p:
        CompareOpc = 391;
        break;
      }
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp_p:
    CompareOpc = 454;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp_p:
    CompareOpc = 710;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb_p:
    CompareOpc = 774;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh_p:
    CompareOpc = 838;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw_p:
    CompareOpc = 902;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsd_p:
    if (Subtarget.hasP8Altivec()) {
      CompareOpc = 967;
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub_p:
    CompareOpc = 518;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh_p:
    CompareOpc = 582;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw_p:
    CompareOpc = 646;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtud_p:
    if (Subtarget.hasP8Altivec()) {
      CompareOpc = 711;
      isDot = true;
    } else
      return false;
    break;

  // VSX predicate comparisons use the same infrastructure.
  case Intrinsic::ppc_vsx_xvcmpeqdp_p:
  case Intrinsic::ppc_vsx_xvcmpgedp_p:
  case Intrinsic::ppc_vsx_xvcmpgtdp_p:
  case Intrinsic::ppc_vsx_xvcmpeqsp_p:
  case Intrinsic::ppc_vsx_xvcmpgesp_p:
  case Intrinsic::ppc_vsx_xvcmpgtsp_p:
    if (Subtarget.hasVSX()) {
      switch (IntrinsicID) {
      case Intrinsic::ppc_vsx_xvcmpeqdp_p:
        CompareOpc = 99;
        break;
      case Intrinsic::ppc_vsx_xvcmpgedp_p:
        CompareOpc = 115;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtdp_p:
        CompareOpc = 107;
        break;
      case Intrinsic::ppc_vsx_xvcmpeqsp_p:
        CompareOpc = 67;
        break;
      case Intrinsic::ppc_vsx_xvcmpgesp_p:
        CompareOpc = 83;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtsp_p:
        CompareOpc = 75;
        break;
      }
      isDot = true;
    } else
      return false;
    break;

  // Normal Comparisons.
  case Intrinsic::ppc_altivec_vcmpbfp:
    CompareOpc = 966;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp:
    CompareOpc = 198;
    break;
  case Intrinsic::ppc_altivec_vcmpequb:
    CompareOpc = 6;
    break;
  case Intrinsic::ppc_altivec_vcmpequh:
    CompareOpc = 70;
    break;
  case Intrinsic::ppc_altivec_vcmpequw:
    CompareOpc = 134;
    break;
  case Intrinsic::ppc_altivec_vcmpequd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 199;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpneb:
  case Intrinsic::ppc_altivec_vcmpneh:
  case Intrinsic::ppc_altivec_vcmpnew:
  case Intrinsic::ppc_altivec_vcmpnezb:
  case Intrinsic::ppc_altivec_vcmpnezh:
  case Intrinsic::ppc_altivec_vcmpnezw:
    if (Subtarget.hasP9Altivec())
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw:
        CompareOpc = 391;
        break;
      }
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp:
    CompareOpc = 454;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp:
    CompareOpc = 710;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb:
    CompareOpc = 774;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh:
    CompareOpc = 838;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw:
    CompareOpc = 902;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 967;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub:
    CompareOpc = 518;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh:
    CompareOpc = 582;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw:
    CompareOpc = 646;
    break;
  case Intrinsic::ppc_altivec_vcmpgtud:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 711;
    else
      return false;
    break;
  }
  return true;
}

/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
/// lower, do it, otherwise return null.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
  unsigned IntrinsicID =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  SDLoc dl(Op);

  if (IntrinsicID == Intrinsic::thread_pointer) {
    // Reads the thread pointer register, used for __builtin_thread_pointer.
    if (Subtarget.isPPC64())
      return DAG.getRegister(PPC::X13, MVT::i64);
    return DAG.getRegister(PPC::R2, MVT::i32);
  }

  // If this is a lowered altivec predicate compare, CompareOpc is set to the
  // opcode number of the comparison.
  int CompareOpc;
  bool isDot;
  if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
    return SDValue();    // Don't custom lower most intrinsics.

  // If this is a non-dot comparison, make the VCMP node and we are done.
  if (!isDot) {
    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
                              Op.getOperand(1), Op.getOperand(2),
                              DAG.getConstant(CompareOpc, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
  }

  // Create the PPCISD altivec 'dot' comparison node.
  SDValue Ops[] = {
    Op.getOperand(2),  // LHS
    Op.getOperand(3),  // RHS
    DAG.getConstant(CompareOpc, dl, MVT::i32)
  };
  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);

  // Now that we have the comparison, emit a copy from the CR to a GPR.
  // This is flagged to the above dot comparison.
  SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
                              DAG.getRegister(PPC::CR6, MVT::i32),
                              CompNode.getValue(1));

  // Unpack the result based on how the target uses it.
  unsigned BitNo;   // Bit # of CR6.
  bool InvertBit;   // Invert result?
  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
  default:  // Can't happen, don't crash on invalid number though.
  case 0:   // Return the value of the EQ bit of CR6.
    BitNo = 0; InvertBit = false;
    break;
  case 1:   // Return the inverted value of the EQ bit of CR6.
    BitNo = 0; InvertBit = true;
    break;
  case 2:   // Return the value of the LT bit of CR6.
    BitNo = 2; InvertBit = false;
    break;
  case 3:   // Return the inverted value of the LT bit of CR6.
    BitNo = 2; InvertBit = true;
    break;
  }
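
  // In the MFOCRF result, CR6 occupies bits 24-27 in IBM bit numbering, i.e.
  // bits 7..4 counting from the LSB: EQ lands at bit 5 and LT at bit 7, which
  // is what the 8 - (3 - BitNo) shift below extracts.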
  // Shift the bit into the low position.
  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
                      DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
  // Isolate the bit.
  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
                      DAG.getConstant(1, dl, MVT::i32));

  // If we are supposed to, toggle the bit.
  if (InvertBit)
    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
                        DAG.getConstant(1, dl, MVT::i32));
  return Flags;
}

SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                               SelectionDAG &DAG) const {
  // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
  // the beginning of the argument list.
  int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
  SDLoc DL(Op);
  switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
  case Intrinsic::ppc_cfence: {
    assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
    assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
    return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
                                      DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
                                                  Op.getOperand(ArgStart + 1)),
                                      Op.getOperand(0)),
                   0);
  }
  default:
    break;
  }
  return SDValue();
}

SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
  // Check for a DIV with the same operands as this REM.
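  // If one exists, return SDValue() so the default expansion computes the
  // remainder from that division as a - (a / b) * b instead of emitting a
  // separate hardware modulo instruction.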
  for (auto UI : Op.getOperand(1)->uses()) {
    if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
        (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
      if (UI->getOperand(0) == Op.getOperand(0) &&
          UI->getOperand(1) == Op.getOperand(1))
        return SDValue();
  }
  return Op;
}

// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // MTVSRDD
  Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
                   Op.getOperand(0));
  // XXBRD
  Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op);
  // MFVSRD
  int VectorIndex = 0;
  if (Subtarget.isLittleEndian())
    VectorIndex = 1;
  Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
                   DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
  return Op;
}

// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
// compared to a value that is atomically loaded (atomic loads zero-extend).
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
         "Expecting an atomic compare-and-swap here.");
  SDLoc dl(Op);
  auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
  EVT MemVT = AtomicNode->getMemoryVT();
  if (MemVT.getSizeInBits() >= 32)
    return Op;

  SDValue CmpOp = Op.getOperand(2);
  // If this is already correctly zero-extended, leave it alone.
  auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
  if (DAG.MaskedValueIsZero(CmpOp, HighBits))
    return Op;

  // Clear the high bits of the compare operand.
  unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
  SDValue NewCmpOp =
      DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
                  DAG.getConstant(MaskVal, dl, MVT::i32));
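  // (MaskVal is 0xFF for an i8 compare-and-swap and 0xFFFF for an i16 one.)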

  // Replace the existing compare operand with the properly zero-extended one.
  SmallVector<SDValue, 4> Ops;
  for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
    Ops.push_back(AtomicNode->getOperand(i));
  Ops[2] = NewCmpOp;
  MachineMemOperand *MMO = AtomicNode->getMemOperand();
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
  auto NodeTy =
      (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8
                         : PPCISD::ATOMIC_CMP_SWAP_16;
  return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}

SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // Create a stack slot that is 16-byte aligned.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, 16, false);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  // Store the input value into Value#0 of the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
                               MachinePointerInfo());
  // Load it out.
  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
         "Should only be called for ISD::INSERT_VECTOR_ELT");

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  // We have legal lowering for constant indices but not for variable ones.
  if (!C)
    return SDValue();

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
    unsigned InsertAtElement = C->getZExtValue();
    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
    if (Subtarget.isLittleEndian()) {
      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
    }
    return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
  }
  return Op;
}

SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDNode *N = Op.getNode();

  assert(N->getOperand(0).getValueType() == MVT::v4i1 &&
         "Unknown extract_vector_elt type");

  SDValue Value = N->getOperand(0);

  // The first part of this is like the store lowering except that we don't
  // need to track the chain.

  // The values are now known to be -1 (false) or 1 (true). To convert this
  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
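  // Check: V == -1 maps to 0.5*(-1) + 0.5 == 0, and V == 1 to 0.5*1 + 0.5 == 1.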
  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
  // understand how to form the extending load.
  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

  // Now convert to an integer and store.
  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
      DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
      Value);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, 16, false);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  SDValue StoreChain = DAG.getEntryNode();
  SDValue Ops[] = {StoreChain,
                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
                   Value, FIdx};
  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);

  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
      dl, VTs, Ops, MVT::v4i32, PtrInfo);

  // Extract the value requested.
  unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
  Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

  SDValue IntVal =
      DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset));

  if (!Subtarget.useCRBits())
    return IntVal;

  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal);
}

/// Lowering for QPX v4i1 loads
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
  SDValue LoadChain = LN->getChain();
  SDValue BasePtr = LN->getBasePtr();

  if (Op.getValueType() == MVT::v4f64 ||
      Op.getValueType() == MVT::v4f32) {
    EVT MemVT = LN->getMemoryVT();
    unsigned Alignment = LN->getAlignment();

    // If this load is properly aligned, then it is legal.
    if (Alignment >= MemVT.getStoreSize())
      return Op;

    EVT ScalarVT = Op.getValueType().getScalarType(),
        ScalarMemVT = MemVT.getScalarType();
    unsigned Stride = ScalarMemVT.getStoreSize();
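    // Stride is the per-element store size: 8 bytes for v4f64, 4 for v4f32.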

    SDValue Vals[4], LoadChains[4];
    for (unsigned Idx = 0; Idx < 4; ++Idx) {
      SDValue Load;
      if (ScalarVT != ScalarMemVT)
        Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain,
                              BasePtr,
                              LN->getPointerInfo().getWithOffset(Idx * Stride),
                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
                              LN->getMemOperand()->getFlags(), LN->getAAInfo());
      else
        Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr,
                           LN->getPointerInfo().getWithOffset(Idx * Stride),
                           MinAlign(Alignment, Idx * Stride),
                           LN->getMemOperand()->getFlags(), LN->getAAInfo());

      if (Idx == 0 && LN->isIndexed()) {
        assert(LN->getAddressingMode() == ISD::PRE_INC &&
               "Unknown addressing mode on vector load");
        Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(),
                                  LN->getAddressingMode());
      }

      Vals[Idx] = Load;
      LoadChains[Idx] = Load.getValue(1);

      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                            DAG.getConstant(Stride, dl,
                                            BasePtr.getValueType()));
    }

    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
    SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals);

    if (LN->isIndexed()) {
      SDValue RetOps[] = { Value, Vals[0].getValue(1), TF };
      return DAG.getMergeValues(RetOps, dl);
    }

    SDValue RetOps[] = { Value, TF };
    return DAG.getMergeValues(RetOps, dl);
  }

  assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower");
  assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported");

  // To lower v4i1 from a byte array, we load the byte elements of the
  // vector and then reuse the BUILD_VECTOR logic.

  SDValue VectElmts[4], VectElmtChains[4];
  for (unsigned i = 0; i < 4; ++i) {
    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);

    VectElmts[i] = DAG.getExtLoad(
        ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx,
        LN->getPointerInfo().getWithOffset(i), MVT::i8,
        /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo());
    VectElmtChains[i] = VectElmts[i].getValue(1);
  }

  LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains);
  SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts);

  SDValue RVals[] = { Value, LoadChain };
  return DAG.getMergeValues(RVals, dl);
}

/// Lowering for QPX v4i1 stores
|
|
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
|
|
SelectionDAG &DAG) const {
|
|
SDLoc dl(Op);
|
|
StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
|
|
SDValue StoreChain = SN->getChain();
|
|
SDValue BasePtr = SN->getBasePtr();
|
|
SDValue Value = SN->getValue();
|
|
|
|
if (Value.getValueType() == MVT::v4f64 ||
|
|
Value.getValueType() == MVT::v4f32) {
|
|
EVT MemVT = SN->getMemoryVT();
|
|
unsigned Alignment = SN->getAlignment();
|
|
|
|
// If this store is properly aligned, then it is legal.
|
|
if (Alignment >= MemVT.getStoreSize())
|
|
return Op;
|
|
|
|
EVT ScalarVT = Value.getValueType().getScalarType(),
|
|
ScalarMemVT = MemVT.getScalarType();
|
|
unsigned Stride = ScalarMemVT.getStoreSize();
|
|
|
|
    SDValue Stores[4];
    for (unsigned Idx = 0; Idx < 4; ++Idx) {
      SDValue Ex = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
          DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
      SDValue Store;
      if (ScalarVT != ScalarMemVT)
        Store =
            DAG.getTruncStore(StoreChain, dl, Ex, BasePtr,
                              SN->getPointerInfo().getWithOffset(Idx * Stride),
                              ScalarMemVT, MinAlign(Alignment, Idx * Stride),
                              SN->getMemOperand()->getFlags(), SN->getAAInfo());
      else
        Store = DAG.getStore(StoreChain, dl, Ex, BasePtr,
                             SN->getPointerInfo().getWithOffset(Idx * Stride),
                             MinAlign(Alignment, Idx * Stride),
                             SN->getMemOperand()->getFlags(), SN->getAAInfo());

      if (Idx == 0 && SN->isIndexed()) {
        assert(SN->getAddressingMode() == ISD::PRE_INC &&
               "Unknown addressing mode on vector store");
        Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(),
                                    SN->getAddressingMode());
      }

      BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                            DAG.getConstant(Stride, dl,
                                            BasePtr.getValueType()));
      Stores[Idx] = Store;
    }

    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

    if (SN->isIndexed()) {
      SDValue RetOps[] = { TF, Stores[0].getValue(1) };
      return DAG.getMergeValues(RetOps, dl);
    }

    return TF;
  }

  assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported");
  assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower");

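  // The sequence below converts the boolean vector to v4f64 (QBFLT),
  // normalizes the lanes to 0.0/1.0 with an fma, converts them to integers
  // (qvfctiwu), spills the result to a stack slot (qvstfiw), and finally
  // reloads each word and truncate-stores it as a byte.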
  // The values are now known to be -1 (false) or 1 (true). To convert this
  // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5).
  // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5
  Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value);

  // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to
  // understand how to form the extending load.
  SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64);

  Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs);

  // Now convert to an integer and store.
  Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64,
                      DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32),
                      Value);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, 16, false);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  SDValue Ops[] = {StoreChain,
                   DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32),
                   Value, FIdx};
  SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other);

  StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID,
                                       dl, VTs, Ops, MVT::v4i32, PtrInfo);

  // Move data into the byte array.
  SDValue Loads[4], LoadChains[4];
  for (unsigned i = 0; i < 4; ++i) {
    unsigned Offset = 4*i;
    SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType());
    Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx);

    Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx,
                           PtrInfo.getWithOffset(Offset));
    LoadChains[i] = Loads[i].getValue(1);
  }

  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);

  SDValue Stores[4];
  for (unsigned i = 0; i < 4; ++i) {
    SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType());
    Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx);

    Stores[i] = DAG.getTruncStore(
        StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i),
        MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(),
        SN->getAAInfo());
  }

  StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);

  return StoreChain;
}

SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  if (Op.getValueType() == MVT::v4i32) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

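    // AltiVec has no full 32-bit vector multiply, so synthesize one from
    // halfword multiplies. For each word, with a = aH*2^16 + aL and
    // b = bH*2^16 + bL:
    //   a*b = aL*bL + ((aH*bL + aL*bH) << 16)   (mod 2^32)
    // vmulouh computes the aL*bL terms, and vmsumuhm applied to the
    // halfword-swapped RHS computes aH*bL + aL*bH.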
    SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl); // +16 as shift amt.

    SDValue RHSSwap = // = vrlw RHS, 16
      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);

    // Shrinkify inputs to v8i16.
    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);

    // Low parts multiplied together, generating 32-bit results (we ignore the
    // top parts).
    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
                                      LHS, RHS, DAG, dl, MVT::v4i32);

    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
    // Shift the high parts up 16 bits.
    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
                              Neg16, DAG, dl);
    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
  } else if (Op.getValueType() == MVT::v8i16) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);

    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
                            LHS, RHS, Zero, DAG, dl);
  } else if (Op.getValueType() == MVT::v16i8) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
    bool isLittleEndian = Subtarget.isLittleEndian();

    // Multiply the even 8-bit parts, producing 16-bit sums.
    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
                                         LHS, RHS, DAG, dl, MVT::v8i16);
    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);

    // Multiply the odd 8-bit parts, producing 16-bit sums.
    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
                                        LHS, RHS, DAG, dl, MVT::v8i16);
    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);

    // Merge the results together. Because vmuleub and vmuloub are
    // instructions with a big-endian bias, we must reverse the
    // element numbering and reverse the meaning of "odd" and "even"
    // when generating little endian code.
    int Ops[16];
    for (unsigned i = 0; i != 8; ++i) {
      if (isLittleEndian) {
        Ops[i*2  ] = 2*i;
        Ops[i*2+1] = 2*i+16;
      } else {
        Ops[i*2  ] = 2*i+1;
        Ops[i*2+1] = 2*i+1+16;
      }
    }
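    // Each 16-bit product carries the 8-bit result in its low byte, so the
    // shuffle mask selects the low byte of each product, alternating between
    // the even-element and odd-element product vectors.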
    if (isLittleEndian)
      return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
    else
      return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
  } else {
    llvm_unreachable("Unknown mul to lower!");
  }
}

SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {

  assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");

  EVT VT = Op.getValueType();
  assert(VT.isVector() &&
         "Only set vector abs as custom, scalar abs shouldn't reach here!");
  assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
          VT == MVT::v16i8) &&
         "Unexpected vector element type!");
  assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
         "Current subtarget doesn't support smax v2i64!");

  // For vector abs, it can be lowered to:
  // abs x
  // ==>
  // y = -x
  // smax(x, y)

  SDLoc dl(Op);
  SDValue X = Op.getOperand(0);
  SDValue Zero = DAG.getConstant(0, dl, VT);
  SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);

  // SMAX patch https://reviews.llvm.org/D47332
  // hasn't landed yet, so use intrinsic first here.
  // TODO: Should use SMAX directly once SMAX patch landed
  Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
  if (VT == MVT::v2i64)
    BifID = Intrinsic::ppc_altivec_vmaxsd;
  else if (VT == MVT::v8i16)
    BifID = Intrinsic::ppc_altivec_vmaxsh;
  else if (VT == MVT::v16i8)
    BifID = Intrinsic::ppc_altivec_vmaxsb;

  return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
}

// Custom lowering for fpext v2f32 to v2f64
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {

  assert(Op.getOpcode() == ISD::FP_EXTEND &&
         "Should only be called for ISD::FP_EXTEND");

  // We only want to custom lower an extend from v2f32 to v2f64.
  if (Op.getValueType() != MVT::v2f64 ||
      Op.getOperand(0).getValueType() != MVT::v2f32)
    return SDValue();

  SDLoc dl(Op);
  SDValue Op0 = Op.getOperand(0);

  switch (Op0.getOpcode()) {
  default:
    return SDValue();
  case ISD::EXTRACT_SUBVECTOR: {
    assert(Op0.getNumOperands() == 2 &&
           isa<ConstantSDNode>(Op0->getOperand(1)) &&
           "Node should have 2 operands with second one being a constant!");

    if (Op0.getOperand(0).getValueType() != MVT::v4f32)
      return SDValue();

    // Custom lower is only done for high or low doubleword.
    int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (Idx % 2 != 0)
      return SDValue();

    // Since input is v4f32, at this point Idx is either 0 or 2.
    // Shift to get the doubleword position we want.
    int DWord = Idx >> 1;

    // High and low word positions are different on little endian.
    if (Subtarget.isLittleEndian())
      DWord ^= 0x1;

    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
                       Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
  }
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FSUB: {
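    // Rather than extending after a v2f32 operation, redo the operation on
    // v4f32 inputs loaded with PPCISD::LD_VSX_LH (a load that places the two
    // f32 elements in one half of a vector register) and extend the result
    // once.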
    SDValue NewLoad[2];
    for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
      // Ensure both inputs are loads.
      SDValue LdOp = Op0.getOperand(i);
      if (LdOp.getOpcode() != ISD::LOAD)
        return SDValue();
      // Generate new load node.
      LoadSDNode *LD = cast<LoadSDNode>(LdOp);
      SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
      NewLoad[i] = DAG.getMemIntrinsicNode(
          PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
          LD->getMemoryVT(), LD->getMemOperand());
    }
    SDValue NewOp =
        DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
                    NewLoad[1], Op0.getNode()->getFlags());
    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
                       DAG.getConstant(0, dl, MVT::i32));
  }
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op0);
    SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
    SDValue NewLd = DAG.getMemIntrinsicNode(
        PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
        LD->getMemoryVT(), LD->getMemOperand());
    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
                       DAG.getConstant(0, dl, MVT::i32));
  }
  }
  llvm_unreachable("ERROR: Should return for all cases within switch.");
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);

  // Variable argument lowering.
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);

  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::GET_DYNAMIC_AREA_OFFSET:
    return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);

  // Exception handling lowering.
  case ISD::EH_DWARF_CFA:       return LowerEH_DWARF_CFA(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);

  case ISD::LOAD:               return LowerLOAD(Op, DAG);
  case ISD::STORE:              return LowerSTORE(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);

  // Lower 64-bit shifts.
  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);

  // Vector-related lowering.
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, DAG);
  case ISD::ABS:                return LowerABS(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);

  // For counter-based loop handling.
  case ISD::INTRINSIC_W_CHAIN:  return SDValue();

  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);

  // Frame & Return address.
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);

  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::SREM:
  case ISD::UREM:
    return LowerREM(Op, DAG);
  case ISD::BSWAP:
    return LowerBSWAP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  }
}

void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case ISD::READCYCLECOUNTER: {
    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
    SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));

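    // On 32-bit targets the 64-bit time base is returned as two i32 halves
    // plus the chain; the type legalizer reassembles the halves into an i64.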
    Results.push_back(RTB);
    Results.push_back(RTB.getValue(1));
    Results.push_back(RTB.getValue(2));
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
        Intrinsic::loop_decrement)
      break;

    assert(N->getValueType(0) == MVT::i1 &&
           "Unexpected result type for CTR decrement intrinsic");
    EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                 N->getValueType(0));
    SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
    SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
                                 N->getOperand(1));

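    // The intrinsic is rebuilt with the legal setcc result type, and the
    // value is truncated back to the expected i1.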
    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
    Results.push_back(NewInt.getValue(1));
    break;
  }
  case ISD::VAARG: {
    if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
      return;

    EVT VT = N->getValueType(0);

    if (VT == MVT::i64) {
      SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);

      Results.push_back(NewNode);
      Results.push_back(NewNode.getValue(1));
    }
    return;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    // LowerFP_TO_INT() can only handle f32 and f64.
    if (N->getOperand(0).getValueType() == MVT::ppcf128)
      return;
    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
    return;
  case ISD::TRUNCATE: {
    EVT TrgVT = N->getValueType(0);
    EVT OpVT = N->getOperand(0).getValueType();
    if (TrgVT.isVector() &&
        isOperationCustom(N->getOpcode(), TrgVT) &&
        OpVT.getSizeInBits() <= 128 &&
        isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits()))
      Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
    return;
  }
  case ISD::BITCAST:
    // Don't handle bitcast here.
    return;
  }
}

//===----------------------------------------------------------------------===//
//  Other Lowering Code
//===----------------------------------------------------------------------===//

static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, Id);
  return Builder.CreateCall(Func, {});
}

// The mappings for emitLeading/TrailingFence are taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
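// In brief: a seq_cst access gets a full sync before it, release (and
// stronger) accesses get an lwsync before, and acquire (and stronger)
// accesses get an lwsync after (or, for 64-bit loads, a dependency created
// via ppc_cfence).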
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  if (Ord == AtomicOrdering::SequentiallyConsistent)
    return callIntrinsic(Builder, Intrinsic::ppc_sync);
  if (isReleaseOrStronger(Ord))
    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
  return nullptr;
}

Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
    // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
    // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
    // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
    if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
      return Builder.CreateCall(
          Intrinsic::getDeclaration(
              Builder.GetInsertBlock()->getParent()->getParent(),
              Intrinsic::ppc_cfence, {Inst->getType()}),
          {Inst});
    // FIXME: Can use isync for rmw operation.
    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
  }
  return nullptr;
}

MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
                                    unsigned AtomicSize,
                                    unsigned BinOpcode,
                                    unsigned CmpOpcode,
                                    unsigned CmpPred) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  auto LoadMnemonic = PPC::LDARX;
  auto StoreMnemonic = PPC::STDCX;
  switch (AtomicSize) {
  default:
    llvm_unreachable("Unexpected size of atomic entity");
  case 1:
    LoadMnemonic = PPC::LBARX;
    StoreMnemonic = PPC::STBCX;
    assert(Subtarget.hasPartwordAtomics() &&
           "Sizes below 4 require partword atomics support");
    break;
  case 2:
    LoadMnemonic = PPC::LHARX;
    StoreMnemonic = PPC::STHCX;
    assert(Subtarget.hasPartwordAtomics() &&
           "Sizes below 4 require partword atomics support");
    break;
  case 4:
    LoadMnemonic = PPC::LWARX;
    StoreMnemonic = PPC::STWCX;
    break;
  case 8:
    LoadMnemonic = PPC::LDARX;
    StoreMnemonic = PPC::STDCX;
    break;
  }

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *F = BB->getParent();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register ptrA = MI.getOperand(1).getReg();
  Register ptrB = MI.getOperand(2).getReg();
  Register incr = MI.getOperand(3).getReg();
  DebugLoc dl = MI.getDebugLoc();

  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB =
      CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, loopMBB);
  if (CmpOpcode)
    F->insert(It, loop2MBB);
  F->insert(It, exitMBB);
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  MachineRegisterInfo &RegInfo = F->getRegInfo();
  Register TmpReg = (!BinOpcode) ? incr :
    RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
                                                   : &PPC::GPRCRegClass);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   add r0, dest, incr
  //   st[wd]cx. r0, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

  // For max/min...
  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   cmpl?[wd] incr, dest
  //   bgt exitMBB
  //  loop2MBB:
  //   st[wd]cx. incr, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

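  // When CmpOpcode is set (min/max), BinOpcode is 0 and TmpReg aliases incr:
  // if the comparison predicate already holds for the loaded value, the
  // location already contains the min/max and we exit without storing;
  // otherwise loop2MBB store-conditionals incr.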
  BB = loopMBB;
  BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
    .addReg(ptrA).addReg(ptrB);
  if (BinOpcode)
    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
  if (CmpOpcode) {
    // Signed comparisons of byte or halfword values must be sign-extended.
    if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
      Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
      BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
              ExtReg).addReg(dest);
      BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
        .addReg(incr).addReg(ExtReg);
    } else
      BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
        .addReg(incr).addReg(dest);

    BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(exitMBB);
    BB = loop2MBB;
  }
  BuildMI(BB, dl, TII->get(StoreMnemonic))
    .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
  BuildMI(BB, dl, TII->get(PPC::BCC))
    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;
  return BB;
}

MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
    MachineInstr &MI, MachineBasicBlock *BB,
    bool is8bit, // true for an 8-bit, false for a 16-bit operation
    unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
  // If we support part-word atomic mnemonics, just use them.
  if (Subtarget.hasPartwordAtomics())
    return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,
                            CmpPred);

  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // In 64 bit mode we have to use 64 bits for addresses, even though the
  // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
  // registers without caring whether they're 32 or 64, but here we're
  // doing actual arithmetic on the addresses.
  bool is64bit = Subtarget.isPPC64();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *F = BB->getParent();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register ptrA = MI.getOperand(1).getReg();
  Register ptrB = MI.getOperand(2).getReg();
  Register incr = MI.getOperand(3).getReg();
  DebugLoc dl = MI.getDebugLoc();

  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB =
      CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, loopMBB);
  if (CmpOpcode)
    F->insert(It, loop2MBB);
  F->insert(It, exitMBB);
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  MachineRegisterInfo &RegInfo = F->getRegInfo();
  const TargetRegisterClass *RC =
      is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

  Register PtrReg = RegInfo.createVirtualRegister(RC);
  Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
  Register ShiftReg =
      isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
  Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
  Register MaskReg = RegInfo.createVirtualRegister(GPRC);
  Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
  Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
  Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
  Register Ptr1Reg;
  Register TmpReg =
      (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // The 4-byte load must be aligned, while a char or short may be
  // anywhere in the word. Hence all this nasty bookkeeping code.
  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
  //   xori shift, shift1, 24 [16]
  //   rlwinm ptr, ptr1, 0, 0, 29
  //   slw incr2, incr, shift
  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
  //   slw mask, mask2, shift
  //  loopMBB:
  //   lwarx tmpDest, ptr
  //   add tmp, tmpDest, incr2
  //   andc tmp2, tmpDest, mask
  //   and tmp3, tmp, mask
  //   or tmp4, tmp3, tmp2
  //   stwcx. tmp4, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  //   srw dest, tmpDest, shift
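  // For example, on a big-endian target the byte at offset 1 within its word
  // has shift1 = 8 and shift = 8 ^ 24 = 16; shifting a byte value left by 16
  // places it in bits 23..16, exactly where big-endian byte 1 lives.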
  if (ptrA != ZeroReg) {
    Ptr1Reg = RegInfo.createVirtualRegister(RC);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
        .addReg(ptrA)
        .addReg(ptrB);
  } else {
    Ptr1Reg = ptrB;
  }
  // We need to use a 32-bit subregister here to avoid a register class
  // mismatch in 64-bit mode.
  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
      .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
      .addImm(3)
      .addImm(27)
      .addImm(is8bit ? 28 : 27);
  if (!isLittleEndian)
    BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
        .addReg(Shift1Reg)
        .addImm(is8bit ? 24 : 16);
  if (is64bit)
    BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
        .addReg(Ptr1Reg)
        .addImm(0)
        .addImm(61);
  else
    BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
        .addReg(Ptr1Reg)
        .addImm(0)
        .addImm(0)
        .addImm(29);
  BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg);
  if (is8bit)
    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
  else {
    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
    BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
        .addReg(Mask3Reg)
        .addImm(65535);
  }
  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
      .addReg(Mask2Reg)
      .addReg(ShiftReg);

  BB = loopMBB;
  BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
      .addReg(ZeroReg)
      .addReg(PtrReg);
  if (BinOpcode)
    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
        .addReg(Incr2Reg)
        .addReg(TmpDestReg);
  BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
      .addReg(TmpDestReg)
      .addReg(MaskReg);
  BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg);
  if (CmpOpcode) {
    // For unsigned comparisons, we can directly compare the shifted values.
    // For signed comparisons we shift and sign extend.
    Register SReg = RegInfo.createVirtualRegister(GPRC);
    BuildMI(BB, dl, TII->get(PPC::AND), SReg)
        .addReg(TmpDestReg)
        .addReg(MaskReg);
    unsigned ValueReg = SReg;
    unsigned CmpReg = Incr2Reg;
    if (CmpOpcode == PPC::CMPW) {
      ValueReg = RegInfo.createVirtualRegister(GPRC);
      BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
          .addReg(SReg)
          .addReg(ShiftReg);
      Register ValueSReg = RegInfo.createVirtualRegister(GPRC);
      BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
          .addReg(ValueReg);
      ValueReg = ValueSReg;
      CmpReg = incr;
    }
    BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0)
        .addReg(CmpReg)
        .addReg(ValueReg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(CmpPred)
        .addReg(PPC::CR0)
        .addMBB(exitMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(exitMBB);
    BB = loop2MBB;
  }
  BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg);
  BuildMI(BB, dl, TII->get(PPC::STWCX))
      .addReg(Tmp4Reg)
      .addReg(ZeroReg)
      .addReg(PtrReg);
  BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE)
      .addReg(PPC::CR0)
      .addMBB(loopMBB);
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;
  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
      .addReg(TmpDestReg)
      .addReg(ShiftReg);
  return BB;
}

llvm::MachineBasicBlock *
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
  Register mainDstReg = MRI.createVirtualRegister(RC);
  Register restoreDstReg = MRI.createVirtualRegister(RC);

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  SjLjSetup mainMBB
  //  bl mainMBB
  //  v_restore = 1
  //  b sinkMBB
  //
  // mainMBB:
  //  buf[LabelOffset] = LR
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Note that the structure of the jmp_buf used here is not compatible
  // with that used by libc, and is not designed to be. Specifically, it
  // stores only those 'reserved' registers that LLVM does not otherwise
  // understand how to spill. Also, by convention, by the time this
  // intrinsic is called, Clang has already stored the frame address in the
  // first slot of the buffer and stack address in the third. Following the
  // X86 target code, we'll store the jump address in the second slot. We also
  // need to save the TOC pointer (R2) to handle jumps between shared
  // libraries, and that will be stored in the fourth slot. The thread
  // identifier (R13) is not affected.

  // thisMBB:
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
  const int64_t BPOffset    = 4 * PVT.getStoreSize();

  // Prepare the IP (jump address) in a register.
  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
  Register LabelReg = MRI.createVirtualRegister(PtrRC);
  Register BufReg = MI.getOperand(1).getReg();

  if (Subtarget.is64BitELFABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
              .addReg(PPC::X2)
              .addImm(TOCOffset)
              .addReg(BufReg)
              .cloneMemRefs(MI);
  }

  // Naked functions never have a base pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned BaseReg;
  if (MF->getFunction().hasFnAttribute(Attribute::Naked))
    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
  else
    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;

  MIB = BuildMI(*thisMBB, MI, DL,
                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
            .addReg(BaseReg)
            .addImm(BPOffset)
            .addReg(BufReg)
            .cloneMemRefs(MI);

  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
  MIB.addRegMask(TRI->getNoPreservedMask());

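  // The bcl above sets LR to the address of the next instruction emitted in
  // thisMBB, i.e. the 'li restoreDstReg, 1' below. mainMBB saves that LR
  // value into the buffer, so a later longjmp lands on the restore path and
  // makes the setjmp return 1.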
  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);

  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
            .addMBB(mainMBB);
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);

  thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
  thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());

  // mainMBB:
  //  mainDstReg = 0
  MIB =
      BuildMI(mainMBB, DL,
              TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);

  // Store IP
  if (Subtarget.isPPC64()) {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
              .addReg(LabelReg)
              .addImm(LabelOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
              .addReg(LabelReg)
              .addImm(LabelOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(PPC::PHI), DstReg)
    .addReg(mainDstReg).addMBB(mainMBB)
    .addReg(restoreDstReg).addMBB(thisMBB);

  MI.eraseFromParent();
  return sinkMBB;
}

MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
      (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  Register Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as a GPR.
  unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
  unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
  unsigned BP =
      (PVT == MVT::i64)
          ? PPC::X30
          : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
                                                              : PPC::R30);

  MachineInstrBuilder MIB;

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset    = 2 * PVT.getStoreSize();
  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
  const int64_t BPOffset    = 4 * PVT.getStoreSize();

  Register BufReg = MI.getOperand(0).getReg();

  // Reload FP (the jumped-to function may not have had a
  // frame pointer, and if so, then its r31 will be restored
  // as necessary).
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
              .addImm(0)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
              .addImm(0)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload IP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
              .addImm(LabelOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
              .addImm(LabelOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload SP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
              .addImm(SPOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
              .addImm(SPOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload BP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
              .addImm(BPOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
              .addImm(BPOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload TOC
  if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
              .addImm(TOCOffset)
              .addReg(BufReg)
              .cloneMemRefs(MI);
  }

  // Jump
  BuildMI(*MBB, MI, DL,
          TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
  BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));

  MI.eraseFromParent();
  return MBB;
}

MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  if (MI.getOpcode() == TargetOpcode::STACKMAP ||
      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
    if (Subtarget.is64BitELFABI() &&
        MI.getOpcode() == TargetOpcode::PATCHPOINT) {
      // Call lowering should have added an r2 operand to indicate a dependence
      // on the TOC base pointer value. It can't, however, because there is no
      // way to mark the dependence as implicit there, and so the stackmap code
      // will confuse it with a regular operand. Instead, add the dependence
      // here.
      MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
    }

    return emitPatchPoint(MI, BB);
  }

  if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
      MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
    return emitEHSjLjSetJmp(MI, BB);
  } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
             MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
    return emitEHSjLjLongJmp(MI, BB);
  }

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // To "insert" these instructions we actually have to insert their
  // control-flow patterns.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  MachineFunction *F = BB->getParent();

  if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
      MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 ||
      MI.getOpcode() == PPC::SELECT_I8) {
    SmallVector<MachineOperand, 2> Cond;
    if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
        MI.getOpcode() == PPC::SELECT_CC_I8)
      Cond.push_back(MI.getOperand(4));
    else
      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
    Cond.push_back(MI.getOperand(1));

    DebugLoc dl = MI.getDebugLoc();
    TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
                      MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
  } else if (MI.getOpcode() == PPC::SELECT_CC_F4 ||
             MI.getOpcode() == PPC::SELECT_CC_F8 ||
             MI.getOpcode() == PPC::SELECT_CC_F16 ||
             MI.getOpcode() == PPC::SELECT_CC_QFRC ||
             MI.getOpcode() == PPC::SELECT_CC_QSRC ||
             MI.getOpcode() == PPC::SELECT_CC_QBRC ||
             MI.getOpcode() == PPC::SELECT_CC_VRRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSRC ||
             MI.getOpcode() == PPC::SELECT_CC_SPE4 ||
             MI.getOpcode() == PPC::SELECT_CC_SPE ||
             MI.getOpcode() == PPC::SELECT_F4 ||
             MI.getOpcode() == PPC::SELECT_F8 ||
             MI.getOpcode() == PPC::SELECT_F16 ||
             MI.getOpcode() == PPC::SELECT_QFRC ||
             MI.getOpcode() == PPC::SELECT_QSRC ||
             MI.getOpcode() == PPC::SELECT_QBRC ||
             MI.getOpcode() == PPC::SELECT_SPE ||
             MI.getOpcode() == PPC::SELECT_SPE4 ||
             MI.getOpcode() == PPC::SELECT_VRRC ||
             MI.getOpcode() == PPC::SELECT_VSFRC ||
             MI.getOpcode() == PPC::SELECT_VSSRC ||
             MI.getOpcode() == PPC::SELECT_VSRC) {
    // The incoming instruction knows the destination vreg to set, the
    // condition code register to branch on, the true/false values to
    // select between, and a branch opcode to use.

    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   cmpTY ccX, r1, r2
    //   bCC copy1MBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB = BB;
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    DebugLoc dl = MI.getDebugLoc();
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    // Next, add the true and fallthrough blocks as its successors.
    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

    if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
        MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
        MI.getOpcode() == PPC::SELECT_F16 ||
        MI.getOpcode() == PPC::SELECT_SPE4 ||
        MI.getOpcode() == PPC::SELECT_SPE ||
        MI.getOpcode() == PPC::SELECT_QFRC ||
        MI.getOpcode() == PPC::SELECT_QSRC ||
        MI.getOpcode() == PPC::SELECT_QBRC ||
        MI.getOpcode() == PPC::SELECT_VRRC ||
        MI.getOpcode() == PPC::SELECT_VSFRC ||
        MI.getOpcode() == PPC::SELECT_VSSRC ||
        MI.getOpcode() == PPC::SELECT_VSRC) {
      BuildMI(BB, dl, TII->get(PPC::BC))
          .addReg(MI.getOperand(1).getReg())
          .addMBB(sinkMBB);
    } else {
      unsigned SelectPred = MI.getOperand(4).getImm();
      BuildMI(BB, dl, TII->get(PPC::BCC))
          .addImm(SelectPred)
          .addReg(MI.getOperand(1).getReg())
          .addMBB(sinkMBB);
    }

    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;

    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);

    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
        .addReg(MI.getOperand(3).getReg())
        .addMBB(copy0MBB)
        .addReg(MI.getOperand(2).getReg())
        .addMBB(thisMBB);
  } else if (MI.getOpcode() == PPC::ReadTB) {
    // To read the 64-bit time-base register on a 32-bit target, we read the
    // two halves. Should the counter have wrapped while it was being read, we
    // need to try again.
    // ...
    // readLoop:
    // mfspr Rx,TBU # load from TBU
    // mfspr Ry,TB # load from TB
    // mfspr Rz,TBU # load from TBU
    // cmpw crX,Rx,Rz # check if 'old'='new'
    // bne readLoop # branch if they're not equal
    // ...

    MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    DebugLoc dl = MI.getDebugLoc();
    F->insert(It, readMBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(readMBB);
    BB = readMBB;

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
    Register LoReg = MI.getOperand(0).getReg();
    Register HiReg = MI.getOperand(1).getReg();

    BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
    BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
    BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);

    Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);

    BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
        .addReg(HiReg)
        .addReg(ReadAgainReg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(CmpReg)
        .addMBB(readMBB);

    BB->addSuccessor(readMBB);
    BB->addSuccessor(sinkMBB);
  } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE);

  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
           MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
           (Subtarget.hasPartwordAtomics() &&
            MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
           (Subtarget.hasPartwordAtomics() &&
            MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
    bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;

    auto LoadMnemonic = PPC::LDARX;
    auto StoreMnemonic = PPC::STDCX;
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("Compare and swap of unknown size");
    case PPC::ATOMIC_CMP_SWAP_I8:
      LoadMnemonic = PPC::LBARX;
      StoreMnemonic = PPC::STBCX;
      assert(Subtarget.hasPartwordAtomics() &&
             "Partword atomics are not supported");
      break;
    case PPC::ATOMIC_CMP_SWAP_I16:
      LoadMnemonic = PPC::LHARX;
      StoreMnemonic = PPC::STHCX;
      assert(Subtarget.hasPartwordAtomics() &&
             "Partword atomics are not supported");
      break;
    case PPC::ATOMIC_CMP_SWAP_I32:
      LoadMnemonic = PPC::LWARX;
      StoreMnemonic = PPC::STWCX;
      break;
    case PPC::ATOMIC_CMP_SWAP_I64:
      LoadMnemonic = PPC::LDARX;
      StoreMnemonic = PPC::STDCX;
      break;
    }
    Register dest = MI.getOperand(0).getReg();
    Register ptrA = MI.getOperand(1).getReg();
    Register ptrB = MI.getOperand(2).getReg();
    Register oldval = MI.getOperand(3).getReg();
    Register newval = MI.getOperand(4).getReg();
    DebugLoc dl = MI.getDebugLoc();

    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, loop1MBB);
    F->insert(It, loop2MBB);
    F->insert(It, midMBB);
    F->insert(It, exitMBB);
    exitMBB->splice(exitMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    exitMBB->transferSuccessorsAndUpdatePHIs(BB);

    //  thisMBB:
    //   ...
    //   fallthrough --> loop1MBB
    BB->addSuccessor(loop1MBB);

    //  loop1MBB:
    //   l[bhwd]arx dest, ptr
    //   cmp[wd] dest, oldval
    //   bne- midMBB
    //  loop2MBB:
    //   st[bhwd]cx. newval, ptr
    //   bne- loop1MBB
    //   b exitBB
    //  midMBB:
    //   st[bhwd]cx. dest, ptr
    //  exitBB:
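    // midMBB store-conditionals the value that was just loaded; on the
    // compare-failure path this terminates the outstanding reservation
    // without changing the contents of memory.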
BB = loop1MBB;
|
|
BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB);
|
|
BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
|
|
.addReg(oldval)
|
|
.addReg(dest);
|
|
BuildMI(BB, dl, TII->get(PPC::BCC))
|
|
.addImm(PPC::PRED_NE)
|
|
.addReg(PPC::CR0)
|
|
.addMBB(midMBB);
|
|
BB->addSuccessor(loop2MBB);
|
|
BB->addSuccessor(midMBB);
|
|
|
|
BB = loop2MBB;
|
|
BuildMI(BB, dl, TII->get(StoreMnemonic))
|
|
.addReg(newval)
|
|
.addReg(ptrA)
|
|
.addReg(ptrB);
|
|
BuildMI(BB, dl, TII->get(PPC::BCC))
|
|
.addImm(PPC::PRED_NE)
|
|
.addReg(PPC::CR0)
|
|
.addMBB(loop1MBB);
|
|
BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
|
|
BB->addSuccessor(loop1MBB);
|
|
BB->addSuccessor(exitMBB);
|
|
|
|
BB = midMBB;
|
|
BuildMI(BB, dl, TII->get(StoreMnemonic))
|
|
.addReg(dest)
|
|
.addReg(ptrA)
|
|
.addReg(ptrB);
|
|
BB->addSuccessor(exitMBB);
|
|
|
|
// exitMBB:
|
|
// ...
|
|
BB = exitMBB;
|
|
} else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
|
|
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
|
|
// We must use 64-bit registers for addresses when targeting 64-bit,
|
|
// since we're actually doing arithmetic on them. Other registers
|
|
// can be 32-bit.
|
|
bool is64bit = Subtarget.isPPC64();
|
|
bool isLittleEndian = Subtarget.isLittleEndian();
|
|
bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
|
|
|
|
Register dest = MI.getOperand(0).getReg();
|
|
Register ptrA = MI.getOperand(1).getReg();
|
|
Register ptrB = MI.getOperand(2).getReg();
|
|
Register oldval = MI.getOperand(3).getReg();
|
|
Register newval = MI.getOperand(4).getReg();
|
|
DebugLoc dl = MI.getDebugLoc();
|
|
|
|
MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
F->insert(It, loop1MBB);
|
|
F->insert(It, loop2MBB);
|
|
F->insert(It, midMBB);
|
|
F->insert(It, exitMBB);
|
|
exitMBB->splice(exitMBB->begin(), BB,
|
|
std::next(MachineBasicBlock::iterator(MI)), BB->end());
|
|
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
|
|
|
|
MachineRegisterInfo &RegInfo = F->getRegInfo();
|
|
const TargetRegisterClass *RC =
|
|
is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
|
|
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
|
|
|
|
Register PtrReg = RegInfo.createVirtualRegister(RC);
|
|
Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register ShiftReg =
|
|
isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
|
|
Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register MaskReg = RegInfo.createVirtualRegister(GPRC);
|
|
Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
|
|
Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
|
|
Register Ptr1Reg;
|
|
Register TmpReg = RegInfo.createVirtualRegister(GPRC);
|
|
Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
|
|
// thisMBB:
|
|
// ...
|
|
// fallthrough --> loopMBB
|
|
BB->addSuccessor(loop1MBB);
|
|
|
|
// The 4-byte load must be aligned, while a char or short may be
|
|
// anywhere in the word. Hence all this nasty bookkeeping code.
|
|
// add ptr1, ptrA, ptrB [copy if ptrA==0]
|
|
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
|
|
// xori shift, shift1, 24 [16]
|
|
// rlwinm ptr, ptr1, 0, 0, 29
|
|
// slw newval2, newval, shift
|
|
// slw oldval2, oldval,shift
|
|
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
|
|
// slw mask, mask2, shift
|
|
// and newval3, newval2, mask
|
|
// and oldval3, oldval2, mask
|
|
// loop1MBB:
|
|
// lwarx tmpDest, ptr
|
|
// and tmp, tmpDest, mask
|
|
// cmpw tmp, oldval3
|
|
// bne- midMBB
|
|
// loop2MBB:
|
|
// andc tmp2, tmpDest, mask
|
|
// or tmp4, tmp2, newval3
|
|
// stwcx. tmp4, ptr
|
|
// bne- loop1MBB
|
|
// b exitBB
|
|
// midMBB:
|
|
// stwcx. tmpDest, ptr
|
|
// exitBB:
|
|
// srw dest, tmpDest, shift
|
|
if (ptrA != ZeroReg) {
|
|
Ptr1Reg = RegInfo.createVirtualRegister(RC);
|
|
BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
|
|
.addReg(ptrA)
|
|
.addReg(ptrB);
|
|
} else {
|
|
Ptr1Reg = ptrB;
|
|
}
|
|
|
|
// We need use 32-bit subregister to avoid mismatch register class in 64-bit
|
|
// mode.
|
|
BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
|
|
.addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
|
|
.addImm(3)
|
|
.addImm(27)
|
|
.addImm(is8bit ? 28 : 27);
|
|
if (!isLittleEndian)
|
|
BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
|
|
.addReg(Shift1Reg)
|
|
.addImm(is8bit ? 24 : 16);
|
|
if (is64bit)
|
|
BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
|
|
.addReg(Ptr1Reg)
|
|
.addImm(0)
|
|
.addImm(61);
|
|
else
|
|
BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
|
|
.addReg(Ptr1Reg)
|
|
.addImm(0)
|
|
.addImm(0)
|
|
.addImm(29);
    BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
        .addReg(newval)
        .addReg(ShiftReg);
    BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
        .addReg(oldval)
        .addReg(ShiftReg);
    if (is8bit)
      BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
    else {
      BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
      BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
          .addReg(Mask3Reg)
          .addImm(65535);
    }
    BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
        .addReg(Mask2Reg)
        .addReg(ShiftReg);
    BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
        .addReg(NewVal2Reg)
        .addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
        .addReg(OldVal2Reg)
        .addReg(MaskReg);

    BB = loop1MBB;
    BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
        .addReg(ZeroReg)
        .addReg(PtrReg);
    BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
        .addReg(TmpDestReg)
        .addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
        .addReg(TmpReg)
        .addReg(OldVal3Reg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(PPC::CR0)
        .addMBB(midMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(midMBB);

    BB = loop2MBB;
    BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
        .addReg(TmpDestReg)
        .addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
        .addReg(Tmp2Reg)
        .addReg(NewVal3Reg);
    BuildMI(BB, dl, TII->get(PPC::STWCX))
        .addReg(Tmp4Reg)
        .addReg(ZeroReg)
        .addReg(PtrReg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(PPC::CR0)
        .addMBB(loop1MBB);
    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
    BB->addSuccessor(loop1MBB);
    BB->addSuccessor(exitMBB);

    BB = midMBB;
    BuildMI(BB, dl, TII->get(PPC::STWCX))
        .addReg(TmpDestReg)
        .addReg(ZeroReg)
        .addReg(PtrReg);
    BB->addSuccessor(exitMBB);

    // exitMBB:
    //   ...
    BB = exitMBB;
    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
        .addReg(TmpReg)
        .addReg(ShiftReg);
  } else if (MI.getOpcode() == PPC::FADDrtz) {
    // This pseudo performs an FADD with rounding mode temporarily forced
    // to round-to-zero. We emit this via custom inserter since the FPSCR
    // is not modeled at the SelectionDAG level.
    Register Dest = MI.getOperand(0).getReg();
    Register Src1 = MI.getOperand(1).getReg();
    Register Src2 = MI.getOperand(2).getReg();
    DebugLoc dl = MI.getDebugLoc();

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);

    // Save FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);

    // Set rounding mode to round-to-zero.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
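    // Setting bit 31 and clearing bit 30 leaves the FPSCR RN field (the two
    // low-order rounding-control bits, numbered 62:63 elsewhere in this file)
    // holding 0b01, which is the round-toward-zero encoding.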

    // Perform addition.
    BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);

    // Restore FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
  } else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
             MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
             MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
             MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
    unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
                       MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
                          ? PPC::ANDI8_rec
                          : PPC::ANDI_rec;
    bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
                 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register Dest = RegInfo.createVirtualRegister(
        Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);

    DebugLoc Dl = MI.getDebugLoc();
    BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest)
        .addReg(MI.getOperand(1).getReg())
        .addImm(1);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT);
  } else if (MI.getOpcode() == PPC::TCHECK_RET) {
    DebugLoc Dl = MI.getDebugLoc();
    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
    BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(CRReg);
  } else if (MI.getOpcode() == PPC::TBEGIN_RET) {
    DebugLoc Dl = MI.getDebugLoc();
    unsigned Imm = MI.getOperand(1).getImm();
    BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(PPC::CR0EQ);
  } else if (MI.getOpcode() == PPC::SETRNDi) {
    DebugLoc dl = MI.getDebugLoc();
    Register OldFPSCRReg = MI.getOperand(0).getReg();

    // Save FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);

    // The floating point rounding mode is in bits 62:63 of the FPSCR, and
    // has the following settings:
    //   00 Round to nearest
    //   01 Round to 0
    //   10 Round to +inf
    //   11 Round to -inf

    // When the operand is an immediate, use its two least significant bits
    // to set bits 62:63 of the FPSCR.
    unsigned Mode = MI.getOperand(1).getImm();
    BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
        .addImm(31);

    BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
        .addImm(30);
  } else if (MI.getOpcode() == PPC::SETRND) {
    DebugLoc dl = MI.getDebugLoc();

    // Copy a register from F8RCRegClass to G8RCRegClass, or vice versa. If
    // the target doesn't have direct moves (i.e. no mtvsrd/mfvsrd to do the
    // conversion directly), bounce the value through a stack slot instead.
    auto copyRegFromG8RCOrF8RC = [&](unsigned DestReg, unsigned SrcReg) {
      if (Subtarget.hasDirectMove()) {
        BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
            .addReg(SrcReg);
      } else {
        // Use the stack to do the register copy.
        unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
        MachineRegisterInfo &RegInfo = F->getRegInfo();
        const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
        if (RC == &PPC::F8RCRegClass) {
          // Copy the register from F8RCRegClass to G8RCRegClass.
          assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
                 "Unsupported RegClass.");

          StoreOp = PPC::STFD;
          LoadOp = PPC::LD;
        } else {
          // Copy the register from G8RCRegClass to F8RCRegClass.
          assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
                 (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
                 "Unsupported RegClass.");
        }

        MachineFrameInfo &MFI = F->getFrameInfo();
        int FrameIdx = MFI.CreateStackObject(8, 8, false);

        MachineMemOperand *MMOStore = F->getMachineMemOperand(
            MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
            MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
            MFI.getObjectAlignment(FrameIdx));

        // Store SrcReg to the stack slot.
        BuildMI(*BB, MI, dl, TII->get(StoreOp))
            .addReg(SrcReg)
            .addImm(0)
            .addFrameIndex(FrameIdx)
            .addMemOperand(MMOStore);

        MachineMemOperand *MMOLoad = F->getMachineMemOperand(
            MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
            MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
            MFI.getObjectAlignment(FrameIdx));

        // Load from the stack slot where SrcReg was stored into DestReg,
        // completing the register class conversion from SrcReg's class to
        // DestReg's class.
        BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
            .addImm(0)
            .addFrameIndex(FrameIdx)
            .addMemOperand(MMOLoad);
      }
    };

    Register OldFPSCRReg = MI.getOperand(0).getReg();

    // Save FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);

    // When the operand is a GPR, use its two least significant bits together
    // with the mtfsf instruction to set bits 62:63 of the FPSCR:
    //
    //   copy OldFPSCRTmpReg, OldFPSCRReg
    //   (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
    //   rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
    //   copy NewFPSCRReg, NewFPSCRTmpReg
    //   mtfsf 255, NewFPSCRReg
    MachineOperand SrcOp = MI.getOperand(1);
    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);

    copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);

    Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
    Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);

    // The first operand of INSERT_SUBREG should be a register that has
    // subregisters. Since we only care about its register class, an
    // IMPLICIT_DEF register suffices.
    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
    BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
        .addReg(ImDefReg)
        .add(SrcOp)
        .addImm(1);

    Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
    BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
        .addReg(OldFPSCRTmpReg)
        .addReg(ExtSrcReg)
        .addImm(0)
        .addImm(62);

    Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
    copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);

    // The mask 255 means to put bits 32:63 of NewFPSCRReg into bits 32:63 of
    // the FPSCR.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
        .addImm(255)
        .addReg(NewFPSCRReg)
        .addImm(0)
        .addImm(0);
  } else {
    llvm_unreachable("Unexpected instr type to insert");
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//

static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
  // For the estimates, convergence is quadratic, so we essentially double the
  // number of digits correct after every iteration. For both FRE and FRSQRTE,
  // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
  // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
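  // A sketch of the arithmetic: starting from 2^-5, successive steps give
  // 2^-10, 2^-20, 2^-40, so float (23 digits) needs 3 steps and double
  // (52 digits) needs 4. Starting from 2^-14, one step gives 2^-28 (enough
  // for float) and two steps give 2^-56 (enough for double).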
  int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
  if (VT.getScalarType() == MVT::f64)
    RefinementSteps++;
  return RefinementSteps;
}

SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                           int Enabled, int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

    // The Newton-Raphson computation with a single constant does not provide
    // enough accuracy on some CPUs.
    UseOneConstNR = !Subtarget.needsTwoConstNR();
    return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}

SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
      (VT == MVT::f64 && Subtarget.hasFRE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX()) ||
      (VT == MVT::v4f32 && Subtarget.hasQPX()) ||
      (VT == MVT::v4f64 && Subtarget.hasQPX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
    return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}

unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
  // Note: This functionality is used only when unsafe-fp-math is enabled, and
  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
  // enabled for division), this functionality is redundant with the default
  // combiner logic (once the division -> reciprocal/multiply transformation
  // has taken place). As a result, this matters more for older cores than for
  // newer ones.

  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are two or more FDIVs (for embedded cores with only
  // one FP pipeline) or three or more FDIVs (for generic OOO cores).
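  // For example, with a threshold of 2:
  //   a/d; b/d   =>   r = 1.0/d; a*r; b*r
  // trading the second (and any further) division for one multiply each.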
  switch (Subtarget.getCPUDirective()) {
  default:
    return 3;
  case PPC::DIR_440:
  case PPC::DIR_A2:
  case PPC::DIR_E500:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
    return 2;
  }
}

// isConsecutiveLSLoc needs to work even if all adds have not yet been
// collapsed, and so we need to look through chains of them.
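// For example, (add (add %base, 16), 8) accumulates to Base = %base,
// Offset = 24.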
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
                                      int64_t &Offset, SelectionDAG &DAG) {
  if (DAG.isBaseWithConstantOffset(Loc)) {
    Base = Loc.getOperand(0);
    Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();

    // The base might itself be a base plus an offset, and if so, accumulate
    // that as well.
    getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
  }
}

static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
                               unsigned Bytes, int Dist,
                               SelectionDAG &DAG) {
  if (VT.getSizeInBits() / 8 != Bytes)
    return false;

  SDValue BaseLoc = Base->getBasePtr();
  if (Loc.getOpcode() == ISD::FrameIndex) {
    if (BaseLoc.getOpcode() != ISD::FrameIndex)
      return false;
    const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
    int FS  = MFI.getObjectSize(FI);
    int BFS = MFI.getObjectSize(BFI);
    if (FS != BFS || FS != (int)Bytes) return false;
    return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
  }

  SDValue Base1 = Loc, Base2 = BaseLoc;
  int64_t Offset1 = 0, Offset2 = 0;
  getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
  getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
  if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
    return true;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const GlobalValue *GV1 = nullptr;
  const GlobalValue *GV2 = nullptr;
  Offset1 = 0;
  Offset2 = 0;
  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
  if (isGA1 && isGA2 && GV1 == GV2)
    return Offset1 == (Offset2 + Dist*Bytes);
  return false;
}

// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
// not enforce equality of the chain operands.
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
                            unsigned Bytes, int Dist,
                            SelectionDAG &DAG) {
  if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
    EVT VT = LS->getMemoryVT();
    SDValue Loc = LS->getBasePtr();
    return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
  }

  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    EVT VT;
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    default: return false;
    case Intrinsic::ppc_qpx_qvlfd:
    case Intrinsic::ppc_qpx_qvlfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvlfs:
    case Intrinsic::ppc_qpx_qvlfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvlfcd:
    case Intrinsic::ppc_qpx_qvlfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvlfcs:
    case Intrinsic::ppc_qpx_qvlfcsa:
      VT = MVT::v2f32;
      break;
    case Intrinsic::ppc_qpx_qvlfiwa:
    case Intrinsic::ppc_qpx_qvlfiwz:
    case Intrinsic::ppc_altivec_lvx:
    case Intrinsic::ppc_altivec_lvxl:
    case Intrinsic::ppc_vsx_lxvw4x:
    case Intrinsic::ppc_vsx_lxvw4x_be:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_lxvd2x:
    case Intrinsic::ppc_vsx_lxvd2x_be:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_altivec_lvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_lvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_lvewx:
      VT = MVT::i32;
      break;
    }

    return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
  }

  if (N->getOpcode() == ISD::INTRINSIC_VOID) {
    EVT VT;
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    default: return false;
    case Intrinsic::ppc_qpx_qvstfd:
    case Intrinsic::ppc_qpx_qvstfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvstfs:
    case Intrinsic::ppc_qpx_qvstfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvstfcd:
    case Intrinsic::ppc_qpx_qvstfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfcs:
    case Intrinsic::ppc_qpx_qvstfcsa:
      VT = MVT::v2f32;
      break;
    case Intrinsic::ppc_qpx_qvstfiw:
    case Intrinsic::ppc_qpx_qvstfiwa:
    case Intrinsic::ppc_altivec_stvx:
    case Intrinsic::ppc_altivec_stvxl:
    case Intrinsic::ppc_vsx_stxvw4x:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_vsx_stxvw4x_be:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x_be:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    }

    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
  }

  return false;
}

// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  SDValue Chain = LD->getChain();
  EVT VT = LD->getMemoryVT();

  SmallSet<SDNode *, 16> LoadRoots;
  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
  SmallSet<SDNode *, 16> Visited;

  // First, search up the chain, branching to follow all token-factor operands.
  // If we find a consecutive load, then we're done, otherwise, record all
  // nodes just above the top-level loads and token factors.
  while (!Queue.empty()) {
    SDNode *ChainNext = Queue.pop_back_val();
    if (!Visited.insert(ChainNext).second)
      continue;

    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
        return true;

      if (!Visited.count(ChainLD->getChain().getNode()))
        Queue.push_back(ChainLD->getChain().getNode());
    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
      for (const SDUse &O : ChainNext->ops())
        if (!Visited.count(O.getNode()))
          Queue.push_back(O.getNode());
    } else
      LoadRoots.insert(ChainNext);
  }

  // Second, search down the chain, starting from the top-level nodes recorded
  // in the first phase. These top-level nodes are the nodes just above all
  // loads and token factors. Starting with their uses, recursively look
  // through all loads (just the chain uses) and token factors to find a
  // consecutive load.
  Visited.clear();
  Queue.clear();

  for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
       IE = LoadRoots.end(); I != IE; ++I) {
    Queue.push_back(*I);

    while (!Queue.empty()) {
      SDNode *LoadRoot = Queue.pop_back_val();
      if (!Visited.insert(LoadRoot).second)
        continue;

      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
          return true;

      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
           UE = LoadRoot->use_end(); UI != UE; ++UI)
        if (((isa<MemSDNode>(*UI) &&
              cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
             UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
          Queue.push_back(*UI);
    }
  }

  return false;
}

/// This function is called when we have proved that a SETCC node can be
/// replaced by subtraction (and other supporting instructions) so that the
/// result of comparison is kept in a GPR instead of CR. This function is
/// purely for codegen purposes and has some flags to guide the codegen
/// process.
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
                                     bool Swap, SDLoc &DL, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");

  // Zero extend the operands to the largest legal integer. Originally, they
  // must be of a strictly smaller size.
  auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
                         DAG.getConstant(Size, DL, MVT::i32));
  auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
                         DAG.getConstant(Size, DL, MVT::i32));

  // Swap if needed. Depends on the condition code.
  if (Swap)
    std::swap(Op0, Op1);

  // Subtract extended integers.
  auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);

  // Move the sign bit to the least significant position and zero out the rest.
  // Now the least significant bit carries the result of original comparison.
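  // A worked example (for a SETULT of i32 operands with Size == 64): with
  // x = 3 and y = 5, zext(x) - zext(y) = -2 = 0xFFFF...FFFE, and shifting
  // right by 63 leaves 1, i.e. (x <u y). When x >= y, the subtraction of the
  // zero-extended values cannot go negative, so the shifted result is 0.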
  auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
                             DAG.getConstant(Size - 1, DL, MVT::i32));
  auto Final = Shifted;

  // Complement the result if needed. Based on the condition code.
  if (Complement)
    Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
                        DAG.getConstant(1, DL, MVT::i64));

  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
}

SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // Size of integers being compared has a critical role in the following
  // analysis, so we prefer to do this when all types are legal.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // If all users of SETCC extend its value to a legal integer type,
  // then we replace SETCC with a subtraction.
  for (SDNode::use_iterator UI = N->use_begin(),
       UE = N->use_end(); UI != UE; ++UI) {
    if (UI->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
  }

  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  auto OpSize = N->getOperand(0).getValueSizeInBits();

  unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();

  if (OpSize < Size) {
    switch (CC) {
    default: break;
    case ISD::SETULT:
      return generateEquivalentSub(N, Size, false, false, DL, DAG);
    case ISD::SETULE:
      return generateEquivalentSub(N, Size, true, true, DL, DAG);
    case ISD::SETUGT:
      return generateEquivalentSub(N, Size, false, true, DL, DAG);
    case ISD::SETUGE:
      return generateEquivalentSub(N, Size, true, false, DL, DAG);
    }
  }

  return SDValue();
}

SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
  // If we're tracking CR bits, we need to be careful that we don't have:
  //   trunc(binary-ops(zext(x), zext(y)))
  // or
  //   trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
  // such that we're unnecessarily moving things into GPRs when it would be
  // better to keep them in CR bits.

  // Note that trunc here can be an actual i1 trunc, or can be the effective
  // truncation that comes from a setcc or select_cc.
  if (N->getOpcode() == ISD::TRUNCATE &&
      N->getValueType(0) != MVT::i1)
    return SDValue();

  if (N->getOperand(0).getValueType() != MVT::i32 &&
      N->getOperand(0).getValueType() != MVT::i64)
    return SDValue();

  if (N->getOpcode() == ISD::SETCC ||
      N->getOpcode() == ISD::SELECT_CC) {
    // If we're looking at a comparison, then we need to make sure that the
    // high bits (all except for the first) don't affect the result.
    ISD::CondCode CC =
      cast<CondCodeSDNode>(N->getOperand(
        N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
    unsigned OpBits = N->getOperand(0).getValueSizeInBits();

    if (ISD::isSignedIntSetCC(CC)) {
      if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
          DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
        return SDValue();
    } else if (ISD::isUnsignedIntSetCC(CC)) {
      if (!DAG.MaskedValueIsZero(N->getOperand(0),
                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
          !DAG.MaskedValueIsZero(N->getOperand(1),
                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
        return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
                                             : SDValue());
    } else {
      // This is neither a signed nor an unsigned comparison, just make sure
      // that the high bits are equal.
      KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0));
      KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));

      // We don't really care about what is known about the first bit (if
      // anything), so clear it in all masks prior to comparing them.
      Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
      Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);

      if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One)
        return SDValue();
    }
  }

  // We now know that the higher-order bits are irrelevant, we just need to
  // make sure that all of the intermediate operations are bit operations, and
  // all inputs are extensions.
  if (N->getOperand(0).getOpcode() != ISD::AND &&
      N->getOperand(0).getOpcode() != ISD::OR  &&
      N->getOperand(0).getOpcode() != ISD::XOR &&
      N->getOperand(0).getOpcode() != ISD::SELECT &&
      N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
      N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
      N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
      N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
      N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
    return SDValue();

  if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
      N->getOperand(1).getOpcode() != ISD::AND &&
      N->getOperand(1).getOpcode() != ISD::OR  &&
      N->getOperand(1).getOpcode() != ISD::XOR &&
      N->getOperand(1).getOpcode() != ISD::SELECT &&
      N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
      N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
      N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
      N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
      N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps, PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  for (unsigned i = 0; i < 2; ++i) {
    if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
          N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
          N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
         N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
        isa<ConstantSDNode>(N->getOperand(i)))
      Inputs.push_back(N->getOperand(i));
    else
      BinOps.push_back(N->getOperand(i));

    if (N->getOpcode() == ISD::TRUNCATE)
      break;
  }

  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by extensions.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.back();
    BinOps.pop_back();

    if (!Visited.insert(BinOp.getNode()).second)
      continue;

    PromOps.push_back(BinOp);

    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
        continue;
      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
            BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
            BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
           BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
          isa<ConstantSDNode>(BinOp.getOperand(i))) {
        Inputs.push_back(BinOp.getOperand(i));
      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
                 BinOp.getOperand(i).getOpcode() == ISD::OR ||
                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
                 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
                 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
                 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
                 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
        BinOps.push_back(BinOp.getOperand(i));
      } else {
        // We have an input that is not an extension or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }

  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
         UE = Inputs[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();

      // Make sure that we're not going to promote the non-output-value
      // operand(s) of SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == Inputs[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == Inputs[i] ||
            User->getOperand(1) == Inputs[i])
          return SDValue();
      }
    }
  }

  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
         UE = PromOps[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();

      // Make sure that we're not going to promote the non-output-value
      // operand(s) of SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == PromOps[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == PromOps[i] ||
            User->getOperand(1) == PromOps[i])
          return SDValue();
      }
    }
  }

  // Replace all inputs with the extension operand.
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    // Constants may have users outside the cluster of to-be-promoted nodes,
    // and so we need to replace those as we do the promotions.
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;
    else
      DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
  }

  std::list<HandleSDNode> PromOpHandles;
  for (auto &PromOp : PromOps)
    PromOpHandles.emplace_back(PromOp);

  // Replace all operations (these are all the same, but have a different
  // (i1) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first. Any intermediate truncations or
  // extensions disappear.
  while (!PromOpHandles.empty()) {
    SDValue PromOp = PromOpHandles.back().getValue();
    PromOpHandles.pop_back();

    if (PromOp.getOpcode() == ISD::TRUNCATE ||
        PromOp.getOpcode() == ISD::SIGN_EXTEND ||
        PromOp.getOpcode() == ISD::ZERO_EXTEND ||
        PromOp.getOpcode() == ISD::ANY_EXTEND) {
      if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
          PromOp.getOperand(0).getValueType() != MVT::i1) {
        // The operand is not yet ready (see comment below).
        PromOpHandles.emplace_front(PromOp);
        continue;
      }

      SDValue RepValue = PromOp.getOperand(0);
      if (isa<ConstantSDNode>(RepValue))
        RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);

      DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
      continue;
    }

    unsigned C;
    switch (PromOp.getOpcode()) {
    default: C = 0; break;
    case ISD::SELECT: C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
         PromOp.getOperand(C).getValueType() != MVT::i1) ||
        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
         PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOpHandles.emplace_front(PromOp);
      continue;
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                PromOp.getNode()->op_end());

    // If there are any constant inputs, make sure they're replaced now.
    for (unsigned i = 0; i < 2; ++i)
      if (isa<ConstantSDNode>(Ops[C+i]))
        Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);

    DAG.ReplaceAllUsesOfValueWith(PromOp,
      DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
  }

  // Now we're left with the initial truncation itself.
  if (N->getOpcode() == ISD::TRUNCATE)
    return N->getOperand(0);

  // Otherwise, this is a comparison. The operands to be compared have just
  // changed type (to i1), but everything else is the same.
  return SDValue(N, 0);
}

SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  // If we're tracking CR bits, we need to be careful that we don't have:
  //   zext(binary-ops(trunc(x), trunc(y)))
  // or
  //   zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
  // such that we're unnecessarily moving things into CR bits that can more
  // efficiently stay in GPRs. Note that if we're not certain that the high
  // bits are set as required by the final extension, we still may need to do
  // some masking to get the proper behavior.

  // This same functionality is important on PPC64 when dealing with
  // 32-to-64-bit extensions; these occur often when 32-bit values are used as
  // the return values of functions. Because it is so similar, it is handled
  // here as well.
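  // For example (a sketch): zext i64 (and i32 (trunc i32 %x), (trunc i32 %y))
  // becomes (and i64 %x, %y), possibly followed by a mask of the low 32 bits
  // when the high bits of %x and %y are not known to be zero (see the
  // ReallyNeedsExt handling below).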

  if (N->getValueType(0) != MVT::i32 &&
      N->getValueType(0) != MVT::i64)
    return SDValue();

  if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
        (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::AND &&
      N->getOperand(0).getOpcode() != ISD::OR  &&
      N->getOperand(0).getOpcode() != ISD::XOR &&
      N->getOperand(0).getOpcode() != ISD::SELECT &&
      N->getOperand(0).getOpcode() != ISD::SELECT_CC)
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by truncations.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.back();
    BinOps.pop_back();

    if (!Visited.insert(BinOp.getNode()).second)
      continue;

    PromOps.push_back(BinOp);

    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
        continue;
      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
          isa<ConstantSDNode>(BinOp.getOperand(i))) {
        Inputs.push_back(BinOp.getOperand(i));
      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
                 BinOp.getOperand(i).getOpcode() == ISD::OR ||
                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
        BinOps.push_back(BinOp.getOperand(i));
      } else {
        // We have an input that is not a truncation or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }

  // Map from SELECT/SELECT_CC nodes to the operands that must be truncated
  // when the node is promoted, because those operands are themselves part of
  // the to-be-promoted set.
  DenseMap<SDNode *, EVT> SelectTruncOp[2];

  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
         UE = Inputs[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();

      // If we're going to promote the non-output-value operand(s) of SELECT
      // or SELECT_CC, record them for truncation.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == Inputs[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == Inputs[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
        if (User->getOperand(1) == Inputs[i])
          SelectTruncOp[1].insert(std::make_pair(User,
                                    User->getOperand(1).getValueType()));
      }
    }
  }

  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
         UE = PromOps[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();

      // If we're going to promote the non-output-value operand(s) of SELECT
      // or SELECT_CC, record them for truncation.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == PromOps[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == PromOps[i])
          SelectTruncOp[0].insert(std::make_pair(User,
                                    User->getOperand(0).getValueType()));
        if (User->getOperand(1) == PromOps[i])
          SelectTruncOp[1].insert(std::make_pair(User,
                                    User->getOperand(1).getValueType()));
      }
    }
  }

  unsigned PromBits = N->getOperand(0).getValueSizeInBits();
  bool ReallyNeedsExt = false;
  if (N->getOpcode() != ISD::ANY_EXTEND) {
    // If not all of the inputs are already sign/zero extended, then
    // we'll still need to do that at the end.
    for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
      if (isa<ConstantSDNode>(Inputs[i]))
        continue;

      unsigned OpBits =
        Inputs[i].getOperand(0).getValueSizeInBits();
      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");

      if ((N->getOpcode() == ISD::ZERO_EXTEND &&
           !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
                                  APInt::getHighBitsSet(OpBits,
                                                        OpBits-PromBits))) ||
          (N->getOpcode() == ISD::SIGN_EXTEND &&
           DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
             (OpBits-(PromBits-1)))) {
        ReallyNeedsExt = true;
        break;
      }
    }
  }

  // Replace all inputs, either with the truncation operand, or a
  // truncation or extension to the final output type.
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    // Constant inputs need to be replaced with the to-be-promoted nodes that
    // use them because they might have users outside of the cluster of
    // promoted nodes.
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    SDValue InSrc = Inputs[i].getOperand(0);
    if (Inputs[i].getValueType() == N->getValueType(0))
      DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
    else if (N->getOpcode() == ISD::SIGN_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
    else
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
  }

  std::list<HandleSDNode> PromOpHandles;
  for (auto &PromOp : PromOps)
    PromOpHandles.emplace_back(PromOp);

  // Replace all operations (these are all the same, but have a different
  // (promoted) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first.
  while (!PromOpHandles.empty()) {
    SDValue PromOp = PromOpHandles.back().getValue();
    PromOpHandles.pop_back();

    unsigned C;
    switch (PromOp.getOpcode()) {
    default: C = 0; break;
    case ISD::SELECT: C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
         PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
         PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOpHandles.emplace_front(PromOp);
      continue;
    }

    // For SELECT and SELECT_CC nodes, we do a similar check for any
    // to-be-promoted comparison inputs.
    if (PromOp.getOpcode() == ISD::SELECT ||
        PromOp.getOpcode() == ISD::SELECT_CC) {
      if ((SelectTruncOp[0].count(PromOp.getNode()) &&
           PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
          (SelectTruncOp[1].count(PromOp.getNode()) &&
           PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
        PromOpHandles.emplace_front(PromOp);
        continue;
      }
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                PromOp.getNode()->op_end());

    // If this node has constant inputs, then they'll need to be promoted here.
    for (unsigned i = 0; i < 2; ++i) {
      if (!isa<ConstantSDNode>(Ops[C+i]))
        continue;
      if (Ops[C+i].getValueType() == N->getValueType(0))
        continue;

      if (N->getOpcode() == ISD::SIGN_EXTEND)
        Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
      else if (N->getOpcode() == ISD::ZERO_EXTEND)
        Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
      else
        Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
    }

    // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
    // truncate them again to the original value type.
    if (PromOp.getOpcode() == ISD::SELECT ||
        PromOp.getOpcode() == ISD::SELECT_CC) {
      auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
      if (SI0 != SelectTruncOp[0].end())
        Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
      auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
      if (SI1 != SelectTruncOp[1].end())
        Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
    }

    DAG.ReplaceAllUsesOfValueWith(PromOp,
      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
  }

  // Now we're left with the initial extension itself.
  if (!ReallyNeedsExt)
    return N->getOperand(0);

  // To zero extend, just mask off everything except for the first bit (in the
  // i1 case).
  if (N->getOpcode() == ISD::ZERO_EXTEND)
    return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
                       DAG.getConstant(APInt::getLowBitsSet(
                                         N->getValueSizeInBits(0), PromBits),
                                       dl, N->getValueType(0)));

  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
         "Invalid extension type");
  EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
  SDValue ShiftCst =
    DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
  return DAG.getNode(
      ISD::SRA, dl, N->getValueType(0),
      DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
      ShiftCst);
}

SDValue PPCTargetLowering::combineSetCC(SDNode *N,
                                        DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::SETCC &&
         "Should be called with a SETCC node");

  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse())
      std::swap(LHS, RHS);

    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDLoc DL(N);
      SelectionDAG &DAG = DCI.DAG;
      EVT VT = N->getValueType(0);
      EVT OpVT = LHS.getValueType();
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
  }

  return DAGCombineTruncBoolExt(N, DCI);
}

// Is this an extending load from an f32 to an f64?
static bool isFPExtLoad(SDValue Op) {
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
    return LD->getExtensionType() == ISD::EXTLOAD &&
           Op.getValueType() == MVT::f64;
  return false;
}

/// Reduces the number of fp-to-int conversions when building a vector.
///
/// If this vector is built out of floating to integer conversions,
/// transform it to a vector built out of floating point values followed by a
/// single floating to integer conversion of the vector.
/// Namely  (build_vector (fptosi $A), (fptosi $B), ...)
/// becomes (fptosi (build_vector ($A, $B, ...)))
SDValue PPCTargetLowering::
combineElementTruncationToVectorTruncation(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  SDValue FirstInput = N->getOperand(0);
  assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
         "The input operand must be an fp-to-int conversion.");

  // This combine happens after legalization so the fp_to_[su]i nodes are
  // already converted to PPCISD nodes.
  unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
  if (FirstConversion == PPCISD::FCTIDZ ||
      FirstConversion == PPCISD::FCTIDUZ ||
      FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ) {
    bool IsSplat = true;
    bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
      FirstConversion == PPCISD::FCTIWUZ;
    EVT SrcVT = FirstInput.getOperand(0).getValueType();
    SmallVector<SDValue, 4> Ops;
    EVT TargetVT = N->getValueType(0);
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
      SDValue NextOp = N->getOperand(i);
      if (NextOp.getOpcode() != PPCISD::MFVSR)
        return SDValue();
      unsigned NextConversion = NextOp.getOperand(0).getOpcode();
      if (NextConversion != FirstConversion)
        return SDValue();
      // If we are converting to 32-bit integers, we need to add an FP_ROUND.
      // This is not valid if the input was originally double precision. It is
      // also not profitable to do unless this is an extending load, in which
      // case doing this combine will allow us to combine consecutive loads.
      if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
        return SDValue();
      if (N->getOperand(i) != FirstInput)
        IsSplat = false;
    }

    // If this is a splat, we leave it as-is since there will be only a single
    // fp-to-int conversion followed by a splat of the integer. This is better
    // for 32-bit and smaller ints and neutral for 64-bit ints.
    if (IsSplat)
      return SDValue();

    // Now that we know we have the right type of node, get its operands.
    for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
      SDValue In = N->getOperand(i).getOperand(0);
      if (Is32Bit) {
        // For 32-bit values, we need to add an FP_ROUND node (if we made it
        // here, we know that all inputs are extending loads so this is safe).
        if (In.isUndef())
          Ops.push_back(DAG.getUNDEF(SrcVT));
        else {
          SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl,
                                      MVT::f32, In.getOperand(0),
                                      DAG.getIntPtrConstant(1, dl));
          Ops.push_back(Trunc);
        }
      } else
        Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
    }

    unsigned Opcode;
    if (FirstConversion == PPCISD::FCTIDZ ||
        FirstConversion == PPCISD::FCTIWZ)
      Opcode = ISD::FP_TO_SINT;
    else
      Opcode = ISD::FP_TO_UINT;

    EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
    SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
    return DAG.getNode(Opcode, dl, TargetVT, BV);
  }
  return SDValue();
}

/// Reduce the number of loads when building a vector.
///
/// Building a vector out of multiple loads can be converted to a load
/// of the vector type if the loads are consecutive. If the loads are
/// consecutive but in descending order, a shuffle is added at the end
/// to reorder the vector.
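/// For example, four reverse-consecutive v4i32 element loads become one
/// vector load followed by a shuffle with mask <3,2,1,0>.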
static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SDLoc dl(N);

  // Return early for non-byte-sized element types, as they can't be
  // consecutive.
  if (!N->getValueType(0).getVectorElementType().isByteSized())
    return SDValue();

  bool InputsAreConsecutiveLoads = true;
  bool InputsAreReverseConsecutive = true;
  unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
  SDValue FirstInput = N->getOperand(0);
  bool IsRoundOfExtLoad = false;

  if (FirstInput.getOpcode() == ISD::FP_ROUND &&
      FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
    LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0));
    IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD;
  }
  // Not a build vector of (possibly fp_rounded) loads.
  if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
      N->getNumOperands() == 1)
    return SDValue();

  for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
    // If any inputs are fp_round(extload), they all must be.
    if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
      return SDValue();

    SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
      N->getOperand(i);
    if (NextInput.getOpcode() != ISD::LOAD)
      return SDValue();

    SDValue PreviousInput =
      IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
    LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput);
    LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput);

    // If any inputs are fp_round(extload), they all must be.
    if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
      return SDValue();

    if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG))
      InputsAreConsecutiveLoads = false;
    if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG))
      InputsAreReverseConsecutive = false;

    // Exit early if the loads are neither consecutive nor reverse consecutive.
    if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
      return SDValue();
  }

  assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
         "The loads cannot be both consecutive and reverse consecutive.");

  SDValue FirstLoadOp =
    IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput;
  SDValue LastLoadOp =
    IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) :
                       N->getOperand(N->getNumOperands()-1);

  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp);
  LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp);
  if (InputsAreConsecutiveLoads) {
    assert(LD1 && "Input needs to be a LoadSDNode.");
    return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(),
                       LD1->getBasePtr(), LD1->getPointerInfo(),
                       LD1->getAlignment());
  }
  if (InputsAreReverseConsecutive) {
    assert(LDL && "Input needs to be a LoadSDNode.");
    SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(),
                               LDL->getBasePtr(), LDL->getPointerInfo(),
                               LDL->getAlignment());
    SmallVector<int, 16> Ops;
    for (int i = N->getNumOperands() - 1; i >= 0; i--)
      Ops.push_back(i);

    return DAG.getVectorShuffle(N->getValueType(0), dl, Load,
                                DAG.getUNDEF(N->getValueType(0)), Ops);
  }
  return SDValue();
}

// This function adds the vector_shuffle needed to get the elements of the
// vector extract into the correct position, as specified by the CorrectElems
// encoding.
static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
                                      SDValue Input, uint64_t Elems,
                                      uint64_t CorrectElems) {
  SDLoc dl(N);

  unsigned NumElems = Input.getValueType().getVectorNumElements();
  SmallVector<int, 16> ShuffleMask(NumElems, -1);

  // Knowing the element indices being extracted from the original
  // vector and the order in which they're being inserted, just put
  // them at element indices required for the instruction.
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (DAG.getDataLayout().isLittleEndian())
      ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
    else
      ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
    CorrectElems = CorrectElems >> 8;
    Elems = Elems >> 8;
  }

  SDValue Shuffle =
      DAG.getVectorShuffle(Input.getValueType(), dl, Input,
                           DAG.getUNDEF(Input.getValueType()), ShuffleMask);

  EVT Ty = N->getValueType(0);
  SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
  return BV;
}

// Look for build vector patterns where input operands come from sign
// extended vector_extract elements of specific indices. If the correct indices
// aren't used, add a vector shuffle to fix up the indices and create a new
// PPCISD::SExtVElems node which selects the vector sign extend instructions
// during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
  // This array encodes the indices that the vector sign extend instructions
  // extract from when extending from one type to another for both BE and LE.
  // The right nibble of each byte corresponds to the LE indices,
  // and the left nibble of each byte corresponds to the BE indices.
  // For example: 0x3074B8FC  byte->word
  // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
  // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
  // For example: 0x000070F8  byte->double word
  // For LE: the allowed indices are: 0x0,0x8
  // For BE: the allowed indices are: 0x7,0xF
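  // As a worked example: on BE, a byte->word build vector that extracts
  // elements 3, 7, 11 and 15 in order accumulates Elems == 0x3070B0F0, which
  // equals CorrectElems (0x3074B8FC masked to its left nibbles), so no
  // fix-up shuffle is required.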
  uint64_t TargetElems[] = {
      0x3074B8FC, // b->w
      0x000070F8, // b->d
      0x10325476, // h->w
      0x00003074, // h->d
      0x00001032, // w->d
  };

  uint64_t Elems = 0;
  int Index;
  SDValue Input;

  auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
    if (!Op)
      return false;
    if (Op.getOpcode() != ISD::SIGN_EXTEND &&
        Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
      return false;

    // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
    // of the right width.
    SDValue Extract = Op.getOperand(0);
    if (Extract.getOpcode() == ISD::ANY_EXTEND)
      Extract = Extract.getOperand(0);
    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;

    ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
    if (!ExtOp)
      return false;

    Index = ExtOp->getZExtValue();
    if (Input && Input != Extract.getOperand(0))
      return false;

    if (!Input)
      Input = Extract.getOperand(0);

    Elems = Elems << 8;
    Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
    Elems |= Index;

    return true;
  };

  // If the build vector operands aren't sign-extended vector extracts of the
  // same input vector, then return.
  for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (!isSExtOfVecExtract(N->getOperand(i))) {
      return SDValue();
    }
  }

  // If the vector extract indices are not correct, add the appropriate
  // vector_shuffle.
  int TgtElemArrayIdx;
  int InputSize = Input.getValueType().getScalarSizeInBits();
  int OutputSize = N->getValueType(0).getScalarSizeInBits();
  if (InputSize + OutputSize == 40)
    TgtElemArrayIdx = 0;
  else if (InputSize + OutputSize == 72)
    TgtElemArrayIdx = 1;
  else if (InputSize + OutputSize == 48)
    TgtElemArrayIdx = 2;
  else if (InputSize + OutputSize == 80)
    TgtElemArrayIdx = 3;
  else if (InputSize + OutputSize == 96)
    TgtElemArrayIdx = 4;
  else
    return SDValue();

  uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
  CorrectElems = DAG.getDataLayout().isLittleEndian()
                     ? CorrectElems & 0x0F0F0F0F0F0F0F0F
                     : CorrectElems & 0xF0F0F0F0F0F0F0F0;
  if (Elems != CorrectElems) {
    return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
  }

  // Regular lowering will catch cases where a shuffle is not needed.
  return SDValue();
}
|
|
|
|
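// Combine a BUILD_VECTOR into a more profitable target form where possible:
// a vector conversion for MFVSR-fed float-to-int truncations, a single wide
// load for consecutive loads, a PPCISD::SExtVElems for P9 sign-extended
// vector extracts, and SINT/UINT_VEC_TO_FP for a v2f64 built from converted
// extracts of a v4i32.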
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::BUILD_VECTOR &&
         "Should be called with a BUILD_VECTOR node");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  if (!Subtarget.hasVSX())
    return SDValue();

  // The target independent DAG combiner will leave a build_vector of
  // float-to-int conversions intact. We can generate MUCH better code for
  // a float-to-int conversion of a vector of floats.
  SDValue FirstInput = N->getOperand(0);
  if (FirstInput.getOpcode() == PPCISD::MFVSR) {
    SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
    if (Reduced)
      return Reduced;
  }

  // If we're building a vector out of consecutive loads, just load that
  // vector type.
  SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
  if (Reduced)
    return Reduced;

  // If we're building a vector out of extended elements from another vector
  // we have P9 vector integer extend instructions. The code assumes legal
  // input types (i.e. it can't handle things like v4i16) so do not run before
  // legalization.
  if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
    Reduced = combineBVOfVecSExt(N, DAG);
    if (Reduced)
      return Reduced;
  }

  if (N->getValueType(0) != MVT::v2f64)
    return SDValue();

  // Looking for:
  // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
  if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
      FirstInput.getOpcode() != ISD::UINT_TO_FP)
    return SDValue();
  if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
      N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
    return SDValue();
  if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
    return SDValue();

  SDValue Ext1 = FirstInput.getOperand(0);
  SDValue Ext2 = N->getOperand(1).getOperand(0);
  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
  ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
  if (!Ext1Op || !Ext2Op)
    return SDValue();
  if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
      Ext1.getOperand(0) != Ext2.getOperand(0))
    return SDValue();

  int FirstElem = Ext1Op->getZExtValue();
  int SecondElem = Ext2Op->getZExtValue();
  int SubvecIdx;
  if (FirstElem == 0 && SecondElem == 1)
    SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
  else if (FirstElem == 2 && SecondElem == 3)
    SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
  else
    return SDValue();

  SDValue SrcVec = Ext1.getOperand(0);
  auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
                  PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
  return DAG.getNode(NodeType, dl, MVT::v2f64,
                     SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
}

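// Combine (s|u)int_to_fp: on P9, fold a sub-word (i8/i16) load feeding the
// conversion into an LXSIZX load (plus VEXTS for the signed case), and for
// float->int->float chains elide the store/load round trip by converting in
// registers with FCTID[U]Z followed by FCFID[U][S].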
SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::SINT_TO_FP ||
          N->getOpcode() == ISD::UINT_TO_FP) &&
         "Need an int -> FP conversion node here");

  if (useSoftFloat() || !Subtarget.has64BitSupport())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Op(N, 0);

  // Don't handle ppc_fp128 here or conversions that are out-of-range capable
  // from the hardware.
  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
    return SDValue();
  if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
      Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))
    return SDValue();

  SDValue FirstOperand(Op.getOperand(0));
  bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
                     (FirstOperand.getValueType() == MVT::i8 ||
                      FirstOperand.getValueType() == MVT::i16);
  if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
    bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
    bool DstDouble = Op.getValueType() == MVT::f64;
    unsigned ConvOp = Signed ?
      (DstDouble ? PPCISD::FCFID  : PPCISD::FCFIDS) :
      (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
    SDValue WidthConst =
      DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
                            dl, false);
    LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
    SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
                                         DAG.getVTList(MVT::f64, MVT::Other),
                                         Ops, MVT::i8, LDN->getMemOperand());

    // For signed conversion, we need to sign-extend the value in the VSR.
    if (Signed) {
      SDValue ExtOps[] = { Ld, WidthConst };
      SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
    } else
      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
  }

  // For i32 intermediate values, unfortunately, the conversion functions
  // leave the upper 32 bits of the value undefined. Within the set of
  // scalar instructions, we have no method for zero- or sign-extending the
  // value. Thus, we cannot handle i32 intermediate values here.
  if (Op.getOperand(0).getValueType() == MVT::i32)
    return SDValue();

  assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
                                                            : PPCISD::FCFIDS)
                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
                                                            : PPCISD::FCFID);
  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                  ? MVT::f32
                  : MVT::f64;

  // If we're converting from a float to an int and back to a float again,
  // then we don't need the store/load pair at all.
  if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
       Subtarget.hasFPCVT()) ||
      (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
    SDValue Src = Op.getOperand(0).getOperand(0);
    if (Src.getValueType() == MVT::f32) {
      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
      DCI.AddToWorklist(Src.getNode());
    } else if (Src.getValueType() != MVT::f64) {
      // Make sure that we don't pick up a ppc_fp128 source value.
      return SDValue();
    }

    unsigned FCTOp =
      Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
                                                        PPCISD::FCTIDUZ;

    SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);

    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
      FP = DAG.getNode(ISD::FP_ROUND, dl,
                       MVT::f32, FP, DAG.getIntPtrConstant(0, dl));
      DCI.AddToWorklist(FP.getNode());
    }

    return FP;
  }

  return SDValue();
}

// expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
// builtins) into loads with swaps.
SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Chain;
  SDValue Base;
  MachineMemOperand *MMO;

  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode for little endian VSX load");
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(N);
    Chain = LD->getChain();
    Base = LD->getBasePtr();
    MMO = LD->getMemOperand();
    // If the MMO suggests this isn't a load of a full vector, leave
    // things alone. For a built-in, we have to make the change for
    // correctness, so if there is a size problem that will be a bug.
    if (MMO->getSize() < 16)
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
    Chain = Intrin->getChain();
    // Similarly to the store case below, Intrin->getBasePtr() doesn't get
    // us what we want. Get operand 2 instead.
    Base = Intrin->getOperand(2);
    MMO = Intrin->getMemOperand();
    break;
  }
  }

  MVT VecTy = N->getValueType(0).getSimpleVT();

  // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
  // aligned and the type is a vector with elements up to 4 bytes.
  if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
      VecTy.getScalarSizeInBits() <= 32) {
    return SDValue();
  }

  SDValue LoadOps[] = { Chain, Base };
  SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
                                         DAG.getVTList(MVT::v2f64, MVT::Other),
                                         LoadOps, MVT::v2f64, MMO);

  DCI.AddToWorklist(Load.getNode());
  Chain = Load.getValue(1);
  SDValue Swap = DAG.getNode(
      PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
  DCI.AddToWorklist(Swap.getNode());

  // Add a bitcast if the resulting load type doesn't match v2f64.
  if (VecTy != MVT::v2f64) {
    SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
    DCI.AddToWorklist(N.getNode());
    // Package {bitcast value, swap's chain} to match Load's shape.
    return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
                       N, Swap.getValue(1));
  }

  return Swap;
}

// expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
// builtins) into stores with swaps.
SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Chain;
  SDValue Base;
  unsigned SrcOpnd;
  MachineMemOperand *MMO;

  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode for little endian VSX store");
  case ISD::STORE: {
    StoreSDNode *ST = cast<StoreSDNode>(N);
    Chain = ST->getChain();
    Base = ST->getBasePtr();
    MMO = ST->getMemOperand();
    SrcOpnd = 1;
    // If the MMO suggests this isn't a store of a full vector, leave
    // things alone. For a built-in, we have to make the change for
    // correctness, so if there is a size problem that will be a bug.
    if (MMO->getSize() < 16)
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID: {
    MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
    Chain = Intrin->getChain();
    // Intrin->getBasePtr() oddly does not get what we want.
    Base = Intrin->getOperand(3);
    MMO = Intrin->getMemOperand();
    SrcOpnd = 2;
    break;
  }
  }

  SDValue Src = N->getOperand(SrcOpnd);
  MVT VecTy = Src.getValueType().getSimpleVT();

  // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
  // aligned and the type is a vector with elements up to 4 bytes.
  if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
      VecTy.getScalarSizeInBits() <= 32) {
    return SDValue();
  }

  // All stores are done as v2f64 and a possible bitcast.
  if (VecTy != MVT::v2f64) {
    Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
    DCI.AddToWorklist(Src.getNode());
  }

  SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
                             DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
  DCI.AddToWorklist(Swap.getNode());
  Chain = Swap.getValue(1);
  SDValue StoreOps[] = { Chain, Swap, Base };
  SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
                                          DAG.getVTList(MVT::Other),
                                          StoreOps, VecTy, MMO);
  DCI.AddToWorklist(Store.getNode());
  return Store;
}

// Handle DAG combine for STORE (FP_TO_INT F).
SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  unsigned Opcode = N->getOperand(1).getOpcode();

  assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT)
         && "Not a FP_TO_INT Instruction!");

  SDValue Val = N->getOperand(1).getOperand(0);
  EVT Op1VT = N->getOperand(1).getValueType();
  EVT ResVT = Val.getValueType();

  // Floating point types smaller than 32 bits are not legal on Power.
  if (ResVT.getScalarSizeInBits() < 32)
    return SDValue();

  // Only perform combine for conversion to i64/i32 or power9 i16/i8.
  bool ValidTypeForStoreFltAsInt =
      (Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
       (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));

  if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() ||
      cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
    return SDValue();

  // Extend f32 values to f64.
  if (ResVT.getScalarSizeInBits() == 32) {
    Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
    DCI.AddToWorklist(Val.getNode());
  }

  // Set signed or unsigned conversion opcode.
  unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ?
                        PPCISD::FP_TO_SINT_IN_VSR :
                        PPCISD::FP_TO_UINT_IN_VSR;

  Val = DAG.getNode(ConvOpcode,
                    dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val);
  DCI.AddToWorklist(Val.getNode());

  // Set number of bytes being converted.
  unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
  SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2),
                    DAG.getIntPtrConstant(ByteSize, dl, false),
                    DAG.getValueType(Op1VT) };

  Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
                                DAG.getVTList(MVT::Other), Ops,
                                cast<StoreSDNode>(N)->getMemoryVT(),
                                cast<StoreSDNode>(N)->getMemOperand());

  DCI.AddToWorklist(Val.getNode());
  return Val;
}

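// Combine an element-reversing vector_shuffle paired with a normal vector
// load or store into a single PPCISD::LOAD_VEC_BE / STORE_VEC_BE node.
// This is only done for little-endian subtargets with P9 vector support;
// earlier subtargets rely on the PPCVSXSwapRemoval pass instead.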
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
                                                LSBaseSDNode *LSBase,
                                                DAGCombinerInfo &DCI) const {
  assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
         "Not a reverse memop pattern!");

  // Check that the shuffle mask is a full element reversal, i.e.
  // <N-1, N-2, ..., 1, 0> for an N-element vector.
  auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
    auto Mask = SVN->getMask();
    int i = 0;
    auto I = Mask.rbegin();
    auto E = Mask.rend();

    for (; I != E; ++I) {
      if (*I != i)
        return false;
      i++;
    }
    return true;
  };

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = SVN->getValueType(0);

  if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
    return SDValue();

  // Before P9, we have the PPCVSXSwapRemoval pass to hack the element order.
  // See the comment in PPCVSXSwapRemoval.cpp. This combine conflicts with
  // that optimization, so we don't do it for pre-P9 subtargets.
  if (!Subtarget.hasP9Vector())
    return SDValue();

  if (!IsElementReverse(SVN))
    return SDValue();

  if (LSBase->getOpcode() == ISD::LOAD) {
    SDLoc dl(SVN);
    SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
        LSBase->getMemoryVT(), LSBase->getMemOperand());
  }

  if (LSBase->getOpcode() == ISD::STORE) {
    SDLoc dl(LSBase);
    SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
                          LSBase->getBasePtr()};
    return DAG.getMemIntrinsicNode(
        PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
        LSBase->getMemoryVT(), LSBase->getMemOperand());
  }

  llvm_unreachable("Expected a load or store node here");
}

SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
    return combineADD(N, DCI);
  case ISD::SHL:
    return combineSHL(N, DCI);
  case ISD::SRA:
    return combineSRA(N, DCI);
  case ISD::SRL:
    return combineSRL(N, DCI);
  case ISD::MUL:
    return combineMUL(N, DCI);
  case PPCISD::SHL:
    if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
      return N->getOperand(0);
    break;
  case PPCISD::SRL:
    if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
      return N->getOperand(0);
    break;
  case PPCISD::SRA:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->isNullValue() ||   //  0 >>s V -> 0.
          C->isAllOnesValue())  // -1 >>s V -> -1.
        return N->getOperand(0);
    }
    break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return DAGCombineExtBoolTrunc(N, DCI);
  case ISD::TRUNCATE:
    return combineTRUNCATE(N, DCI);
  case ISD::SETCC:
    if (SDValue CSCC = combineSetCC(N, DCI))
      return CSCC;
    LLVM_FALLTHROUGH;
  case ISD::SELECT_CC:
    return DAGCombineTruncBoolExt(N, DCI);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return combineFPToIntToFP(N, DCI);
  case ISD::VECTOR_SHUFFLE:
    if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
      LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
      return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
    }
    break;
  case ISD::STORE: {
    EVT Op1VT = N->getOperand(1).getValueType();
    unsigned Opcode = N->getOperand(1).getOpcode();

    if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) {
      SDValue Val = combineStoreFPToInt(N, DCI);
      if (Val)
        return Val;
    }

    if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
      ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
      SDValue Val = combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
      if (Val)
        return Val;
    }

    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
    if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
        N->getOperand(1).getNode()->hasOneUse() &&
        (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
         (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {

      // STBRX can only handle simple types and it makes no sense to store
      // fewer than two bytes in byte-reversed order.
      EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
      if (mVT.isExtended() || mVT.getSizeInBits() < 16)
        break;

      SDValue BSwapOp = N->getOperand(1).getOperand(0);
      // Do an any-extend to 32-bits if this is a half-word input.
      if (BSwapOp.getValueType() == MVT::i16)
        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);

      // If the type of the BSWAP operand is wider than the stored memory
      // width, it needs to be shifted right before the STBRX.
      if (Op1VT.bitsGT(mVT)) {
        int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
        BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
                              DAG.getConstant(Shift, dl, MVT::i32));
        // Need to truncate if this is a bswap of i64 stored as i32/i16.
        if (Op1VT == MVT::i64)
          BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
      }

      SDValue Ops[] = {
        N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
      };
      return
        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                cast<StoreSDNode>(N)->getMemOperand());
    }

    // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
    // So it can increase the chance of CSE constant construction.
    if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
        isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
      // Need to sign-extend to 64 bits to handle negative values.
      EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
      uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
                                    MemVT.getSizeInBits());
      SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);

      // DAG.getTruncStore() can't be used here because it doesn't accept
      // the general (base + offset) addressing mode.
      // So we use UpdateNodeOperands and setTruncatingStore instead.
      DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
                             N->getOperand(3));
      cast<StoreSDNode>(N)->setTruncatingStore(true);
      return SDValue(N, 0);
    }

    // For little endian, VSX stores require generating xxswapd/stxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
    if (Op1VT.isSimple()) {
      MVT StoreVT = Op1VT.getSimpleVT();
      if (Subtarget.needsSwapsForVSXMemOps() &&
          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
        return expandVSXStoreForLE(N, DCI);
    }
    break;
  }
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(N);
    EVT VT = LD->getValueType(0);

    // For little endian, VSX loads require generating lxvd2x/xxswapd.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
    if (VT.isSimple()) {
      MVT LoadVT = VT.getSimpleVT();
      if (Subtarget.needsSwapsForVSXMemOps() &&
          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
        return expandVSXLoadForLE(N, DCI);
    }

    // We sometimes end up with a 64-bit integer load, from which we extract
    // two single-precision floating-point numbers. This happens with
    // std::complex<float>, and other similar structures, because of the way we
    // canonicalize structure copies. However, if we lack direct moves,
    // then the final bitcasts from the extracted integer values to the
    // floating-point numbers turn into store/load pairs. Even with direct
    // moves, just loading the two floating-point numbers is likely better.
    auto ReplaceTwoFloatLoad = [&]() {
      if (VT != MVT::i64)
        return false;

      if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
          LD->isVolatile())
        return false;

      // We're looking for a sequence like this:
      // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
      // t16: i64 = srl t13, Constant:i32<32>
      // t17: i32 = truncate t16
      // t18: f32 = bitcast t17
      // t19: i32 = truncate t13
      // t20: f32 = bitcast t19

      if (!LD->hasNUsesOfValue(2, 0))
        return false;

      auto UI = LD->use_begin();
      while (UI.getUse().getResNo() != 0) ++UI;
      SDNode *Trunc = *UI++;
      while (UI.getUse().getResNo() != 0) ++UI;
      SDNode *RightShift = *UI;
      if (Trunc->getOpcode() != ISD::TRUNCATE)
        std::swap(Trunc, RightShift);

      if (Trunc->getOpcode() != ISD::TRUNCATE ||
          Trunc->getValueType(0) != MVT::i32 ||
          !Trunc->hasOneUse())
        return false;
      if (RightShift->getOpcode() != ISD::SRL ||
          !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
          RightShift->getConstantOperandVal(1) != 32 ||
          !RightShift->hasOneUse())
        return false;

      SDNode *Trunc2 = *RightShift->use_begin();
      if (Trunc2->getOpcode() != ISD::TRUNCATE ||
          Trunc2->getValueType(0) != MVT::i32 ||
          !Trunc2->hasOneUse())
        return false;

      SDNode *Bitcast = *Trunc->use_begin();
      SDNode *Bitcast2 = *Trunc2->use_begin();

      if (Bitcast->getOpcode() != ISD::BITCAST ||
          Bitcast->getValueType(0) != MVT::f32)
        return false;
      if (Bitcast2->getOpcode() != ISD::BITCAST ||
          Bitcast2->getValueType(0) != MVT::f32)
        return false;

      if (Subtarget.isLittleEndian())
        std::swap(Bitcast, Bitcast2);

      // Bitcast has the second float (in memory-layout order) and Bitcast2
      // has the first one.

      SDValue BasePtr = LD->getBasePtr();
      if (LD->isIndexed()) {
        assert(LD->getAddressingMode() == ISD::PRE_INC &&
               "Non-pre-inc AM on PPC?");
        BasePtr =
          DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                      LD->getOffset());
      }

      auto MMOFlags =
          LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
      SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
                                      LD->getPointerInfo(), LD->getAlignment(),
                                      MMOFlags, LD->getAAInfo());
      SDValue AddPtr =
        DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
                    BasePtr, DAG.getIntPtrConstant(4, dl));
      SDValue FloatLoad2 = DAG.getLoad(
          MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
          LD->getPointerInfo().getWithOffset(4),
          MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo());

      if (LD->isIndexed()) {
        // Note that DAGCombine should re-form any pre-increment load(s) from
        // what is produced here if that makes sense.
        DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
      }

      DCI.CombineTo(Bitcast2, FloatLoad);
      DCI.CombineTo(Bitcast, FloatLoad2);

      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
                                    SDValue(FloatLoad2.getNode(), 1));
      return true;
    };

    if (ReplaceTwoFloatLoad())
      return SDValue(N, 0);

    EVT MemVT = LD->getMemoryVT();
    Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
    Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
    unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
    if (LD->isUnindexed() && VT.isVector() &&
        ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
          // P8 and later hardware should just use LOAD.
          !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
                                       VT == MVT::v4i32 || VT == MVT::v4f32)) ||
         (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
          LD->getAlignment() >= ScalarABIAlignment)) &&
        LD->getAlignment() < ABIAlignment) {
      // This is a type-legal unaligned Altivec or QPX load.
      SDValue Chain = LD->getChain();
      SDValue Ptr = LD->getBasePtr();
      bool isLittleEndian = Subtarget.isLittleEndian();

      // This implements the loading of unaligned vectors as described in
      // the venerable Apple Velocity Engine overview. Specifically:
      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
      //
      // The general idea is to expand a sequence of one or more unaligned
      // loads into an alignment-based permutation-control instruction (lvsl
      // or lvsr), a series of regular vector loads (which always truncate
      // their input address to an aligned address), and a series of
      // permutations. The results of these permutations are the requested
      // loaded values. The trick is that the last "extra" load is not taken
      // from the address you might suspect (sizeof(vector) bytes after the
      // last requested load), but rather sizeof(vector) - 1 bytes after the
      // last requested vector. The point of this is to avoid a page fault if
      // the base address happened to be aligned. This works because if the
      // base address is aligned, then adding less than a full vector length
      // will cause the last vector in the sequence to be (re)loaded.
      // Otherwise, the next vector will be fetched as you might suspect was
      // necessary.
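      //
      // As an illustrative sketch, a single unaligned big-endian Altivec
      // load of a 16-byte vector at address P becomes roughly:
      //   Cntl   = lvsl(P)      ; permute control from the low address bits
      //   V0     = lvx(P)       ; aligned load covering the first bytes
      //   V1     = lvx(P + 15)  ; aligned load covering the last bytes
      //   Result = vperm(V0, V1, Cntl)
      // (Little endian uses lvsr and swaps the vperm inputs, as below.)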

      // We might be able to reuse the permutation generation from
      // a different base address offset from this one by an aligned amount.
      // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
      // optimization later.
      Intrinsic::ID Intr, IntrLD, IntrPerm;
      MVT PermCntlTy, PermTy, LDTy;
      if (Subtarget.hasAltivec()) {
        Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr :
                                Intrinsic::ppc_altivec_lvsl;
        IntrLD = Intrinsic::ppc_altivec_lvx;
        IntrPerm = Intrinsic::ppc_altivec_vperm;
        PermCntlTy = MVT::v16i8;
        PermTy = MVT::v4i32;
        LDTy = MVT::v4i32;
      } else {
        Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld :
                                     Intrinsic::ppc_qpx_qvlpcls;
        IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd :
                                       Intrinsic::ppc_qpx_qvlfs;
        IntrPerm = Intrinsic::ppc_qpx_qvfperm;
        PermCntlTy = MVT::v4f64;
        PermTy = MVT::v4f64;
        LDTy = MemVT.getSimpleVT();
      }

      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);

      // Create the new MMO for the new base load. It is like the original MMO,
      // but represents an area in memory almost twice the vector size centered
      // on the original address. If the address is unaligned, we might start
      // reading up to (sizeof(vector)-1) bytes below the address of the
      // original unaligned load.
      MachineFunction &MF = DAG.getMachineFunction();
      MachineMemOperand *BaseMMO =
        MF.getMachineMemOperand(LD->getMemOperand(),
                                -(long)MemVT.getStoreSize()+1,
                                2*MemVT.getStoreSize()-1);

      // Create the new base load.
      SDValue LDXIntID =
          DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
      SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
      SDValue BaseLoad =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
                                DAG.getVTList(PermTy, MVT::Other),
                                BaseLoadOps, LDTy, BaseMMO);

      // Note that the value of IncOffset (which is provided to the next
      // load's pointer info offset value, and thus used to calculate the
      // alignment), and the value of IncValue (which is actually used to
      // increment the pointer value) are different! This is because we
      // require the next load to appear to be aligned, even though it
      // is actually offset from the base pointer by a lesser amount.
      int IncOffset = VT.getSizeInBits() / 8;
      int IncValue = IncOffset;

      // Walk (both up and down) the chain looking for another load at the real
      // (aligned) offset (the alignment of the other load does not matter in
      // this case). If found, then do not use the offset reduction trick, as
      // that will prevent the loads from being later combined (as they would
      // otherwise be duplicates).
      if (!findConsecutiveLoad(LD, DAG))
        --IncValue;

      SDValue Increment =
          DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);

      MachineMemOperand *ExtraMMO =
        MF.getMachineMemOperand(LD->getMemOperand(),
                                1, 2*MemVT.getStoreSize()-1);
      SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
      SDValue ExtraLoad =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
                                DAG.getVTList(PermTy, MVT::Other),
                                ExtraLoadOps, LDTy, ExtraMMO);

      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                               BaseLoad.getValue(1), ExtraLoad.getValue(1));

      // Because vperm has a big-endian bias, we must reverse the order
      // of the input vectors and complement the permute control vector
      // when generating little endian code. We have already handled the
      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
      // and ExtraLoad here.
      SDValue Perm;
      if (isLittleEndian)
        Perm = BuildIntrinsicOp(IntrPerm,
                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
      else
        Perm = BuildIntrinsicOp(IntrPerm,
                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);

      if (VT != PermTy)
        Perm = Subtarget.hasAltivec() ?
                 DAG.getNode(ISD::BITCAST, dl, VT, Perm) :
                 DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX
                             DAG.getTargetConstant(1, dl, MVT::i64));
                             // second argument is 1 because this rounding
                             // is always exact.

      // The output of the permutation is our loaded result, the TokenFactor is
      // our new chain.
      DCI.CombineTo(N, Perm, TF);
      return SDValue(N, 0);
    }
  }
  break;
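  // For lvsl/lvsr (and the QPX permute-control intrinsics), reuse an existing
  // permute-control computation whose address differs from this one by an
  // aligned amount, since both produce the same control vector. Also
  // recognize vmaxs[bhw](a, 0-a) and fold it to ISD::ABS so the vabsdu[bhw]
  // patterns can match downstream.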
  case ISD::INTRINSIC_WO_CHAIN: {
    bool isLittleEndian = Subtarget.isLittleEndian();
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
                                         : Intrinsic::ppc_altivec_lvsl);
    if ((IID == Intr ||
         IID == Intrinsic::ppc_qpx_qvlpcld ||
         IID == Intrinsic::ppc_qpx_qvlpcls) &&
        N->getOperand(1)->getOpcode() == ISD::ADD) {
      SDValue Add = N->getOperand(1);

      int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ?
                 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */;

      if (DAG.MaskedValueIsZero(Add->getOperand(1),
                                APInt::getAllOnesValue(Bits /* alignment */)
                                    .zext(Add.getScalarValueSizeInBits()))) {
        SDNode *BasePtr = Add->getOperand(0).getNode();
        for (SDNode::use_iterator UI = BasePtr->use_begin(),
                                  UE = BasePtr->use_end();
             UI != UE; ++UI) {
          if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) {
            // We've found another LVSL/LVSR, and this address is an aligned
            // multiple of that one. The results will be the same, so use the
            // one we've just found instead.
            return SDValue(*UI, 0);
          }
        }
      }

      if (isa<ConstantSDNode>(Add->getOperand(1))) {
        SDNode *BasePtr = Add->getOperand(0).getNode();
        for (SDNode::use_iterator UI = BasePtr->use_begin(),
             UE = BasePtr->use_end(); UI != UE; ++UI) {
          if (UI->getOpcode() == ISD::ADD &&
              isa<ConstantSDNode>(UI->getOperand(1)) &&
              (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
               cast<ConstantSDNode>(UI->getOperand(1))->getZExtValue()) %
              (1ULL << Bits) == 0) {
            SDNode *OtherAdd = *UI;
            for (SDNode::use_iterator VI = OtherAdd->use_begin(),
                 VE = OtherAdd->use_end(); VI != VE; ++VI) {
              if (VI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
                  cast<ConstantSDNode>(VI->getOperand(0))->getZExtValue() == IID) {
                return SDValue(*VI, 0);
              }
            }
          }
        }
      }
    }

    // Combine vmaxsw/h/b(a, a's negation) to abs(a); this exposes the
    // vabsduw/h/b opportunity for downstream combines.
    if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
        (IID == Intrinsic::ppc_altivec_vmaxsw ||
         IID == Intrinsic::ppc_altivec_vmaxsh ||
         IID == Intrinsic::ppc_altivec_vmaxsb)) {
      SDValue V1 = N->getOperand(1);
      SDValue V2 = N->getOperand(2);
      if ((V1.getSimpleValueType() == MVT::v4i32 ||
           V1.getSimpleValueType() == MVT::v8i16 ||
           V1.getSimpleValueType() == MVT::v16i8) &&
          V1.getSimpleValueType() == V2.getSimpleValueType()) {
        // (0-a, a)
        if (V1.getOpcode() == ISD::SUB &&
            ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
            V1.getOperand(1) == V2) {
          return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
        }
        // (a, 0-a)
        if (V2.getOpcode() == ISD::SUB &&
            ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
            V2.getOperand(1) == V1) {
          return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
        }
        // (x-y, y-x)
        if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
            V1.getOperand(0) == V2.getOperand(1) &&
            V1.getOperand(1) == V2.getOperand(0)) {
          return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
        }
      }
    }
  }

  break;
  case ISD::INTRINSIC_W_CHAIN:
    // For little endian, VSX loads require generating lxvd2x/xxswapd.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
    if (Subtarget.needsSwapsForVSXMemOps()) {
      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
      default:
        break;
      case Intrinsic::ppc_vsx_lxvw4x:
      case Intrinsic::ppc_vsx_lxvd2x:
        return expandVSXLoadForLE(N, DCI);
      }
    }
    break;
  case ISD::INTRINSIC_VOID:
    // For little endian, VSX stores require generating xxswapd/stxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
    if (Subtarget.needsSwapsForVSXMemOps()) {
      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
      default:
        break;
      case Intrinsic::ppc_vsx_stxvw4x:
      case Intrinsic::ppc_vsx_stxvd2x:
        return expandVSXStoreForLE(N, DCI);
      }
    }
    break;
  case ISD::BSWAP:
    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
        N->getOperand(0).hasOneUse() &&
        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
         (Subtarget.hasLDBRX() && Subtarget.isPPC64() &&
          N->getValueType(0) == MVT::i64))) {
      SDValue Load = N->getOperand(0);
      LoadSDNode *LD = cast<LoadSDNode>(Load);
      // Create the byte-swapping load.
      SDValue Ops[] = {
        LD->getChain(),                       // Chain
        LD->getBasePtr(),                     // Ptr
        DAG.getValueType(N->getValueType(0))  // VT
      };
      SDValue BSLoad =
        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
                                              MVT::i64 : MVT::i32, MVT::Other),
                                Ops, LD->getMemoryVT(), LD->getMemOperand());

      // If this is an i16 load, insert the truncate.
      SDValue ResVal = BSLoad;
      if (N->getValueType(0) == MVT::i16)
        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);

      // First, combine the bswap away. This makes the value produced by the
      // load dead.
      DCI.CombineTo(N, ResVal);

      // Next, combine the load away; we give it a bogus result value but a
      // real chain result. The result value is dead because the bswap is
      // dead.
      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));

      // Return N so it doesn't get rechecked!
      return SDValue(N, 0);
    }
    break;
  case PPCISD::VCMP:
    // If a VCMPo node already exists with exactly the same operands as this
    // node, use its result instead of this node (VCMPo computes both a CR6 and
    // a normal output).
    if (!N->getOperand(0).hasOneUse() &&
        !N->getOperand(1).hasOneUse() &&
        !N->getOperand(2).hasOneUse()) {

      // Scan all of the users of the LHS, looking for VCMPo's that match.
      SDNode *VCMPoNode = nullptr;

      SDNode *LHSN = N->getOperand(0).getNode();
      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
           UI != E; ++UI)
        if (UI->getOpcode() == PPCISD::VCMPo &&
            UI->getOperand(1) == N->getOperand(1) &&
            UI->getOperand(2) == N->getOperand(2) &&
            UI->getOperand(0) == N->getOperand(0)) {
          VCMPoNode = *UI;
          break;
        }

      // If there is no VCMPo node, or if the flag value has a single use,
      // don't transform this.
      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
        break;

      // Look at the (necessarily single) use of the flag value. If it has a
      // chain, this transformation is more complex. Note that multiple things
      // could use the value result, which we should ignore.
      SDNode *FlagUser = nullptr;
      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
           FlagUser == nullptr; ++UI) {
        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
        SDNode *User = *UI;
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
            FlagUser = User;
            break;
          }
        }
      }

      // If the user is a MFOCRF instruction, we know this is safe.
      // Otherwise we give up for right now.
      if (FlagUser->getOpcode() == PPCISD::MFOCRF)
        return SDValue(VCMPoNode, 0);
    }
    break;
  case ISD::BRCOND: {
    SDValue Cond = N->getOperand(1);
    SDValue Target = N->getOperand(2);

    if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
          Intrinsic::loop_decrement) {

      // We now need to make the intrinsic dead (it cannot be instruction
      // selected).
      DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
      assert(Cond.getNode()->hasOneUse() &&
             "Counter decrement has more than one use");

      return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
                         N->getOperand(0), Target);
    }
  }
  break;
  case ISD::BR_CC: {
    // If this is a branch on an altivec predicate comparison, lower this so
    // that we don't have to do a MFOCRF: instead, branch directly on CR6. This
    // lowering is done pre-legalize, because the legalizer lowers the predicate
    // compare down to code that is difficult to reassemble.
    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);

    // Sometimes the promoted value of the intrinsic is ANDed by some non-zero
    // value. If so, pass-through the AND to get to the intrinsic.
    if (LHS.getOpcode() == ISD::AND &&
        LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
          Intrinsic::loop_decrement &&
        isa<ConstantSDNode>(LHS.getOperand(1)) &&
        !isNullConstant(LHS.getOperand(1)))
      LHS = LHS.getOperand(0);

    if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
          Intrinsic::loop_decrement &&
        isa<ConstantSDNode>(RHS)) {
      assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
             "Counter decrement comparison is not EQ or NE");

      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
      bool isBDNZ = (CC == ISD::SETEQ && Val) ||
                    (CC == ISD::SETNE && !Val);

      // We now need to make the intrinsic dead (it cannot be instruction
      // selected).
      DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
      assert(LHS.getNode()->hasOneUse() &&
             "Counter decrement has more than one use");

      return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
                         N->getOperand(0), N->getOperand(4));
    }

    int CompareOpc;
    bool isDot;

    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
        getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
      assert(isDot && "Can't compare against a vector result!");

      // If this is a comparison against something other than 0/1, then we know
      // that the condition is never/always true.
      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
      if (Val != 0 && Val != 1) {
        if (CC == ISD::SETEQ)  // Cond never true, remove branch.
          return N->getOperand(0);
        // Always !=, turn it into an unconditional branch.
        return DAG.getNode(ISD::BR, dl, MVT::Other,
                           N->getOperand(0), N->getOperand(4));
      }

      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);

      // Create the PPCISD altivec 'dot' comparison node.
      SDValue Ops[] = {
        LHS.getOperand(2),  // LHS of compare
        LHS.getOperand(3),  // RHS of compare
        DAG.getConstant(CompareOpc, dl, MVT::i32)
      };
      EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);

      // Unpack the result based on how the target uses it.
      PPC::Predicate CompOpc;
      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
      default:  // Can't happen, don't crash on invalid number though.
      case 0:   // Branch on the value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
        break;
      case 1:   // Branch on the inverted value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
        break;
      case 2:   // Branch on the value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
        break;
      case 3:   // Branch on the inverted value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
        break;
      }

      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
                         DAG.getConstant(CompOpc, dl, MVT::i32),
                         DAG.getRegister(PPC::CR6, MVT::i32),
                         N->getOperand(4), CompNode.getValue(1));
    }
    break;
  }
  case ISD::BUILD_VECTOR:
    return DAGCombineBuildVector(N, DCI);
  case ISD::ABS:
    return combineABS(N, DCI);
  case ISD::VSELECT:
    return combineVSelect(N, DCI);
  }

  return SDValue();
}

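// For sdiv by a (possibly negated) power of two, emit the shift-based
// expansion via PPCISD::SRA_ADDZE: an arithmetic shift right that sets the
// carry for negative dividends, followed by addze to round the quotient
// toward zero. As a rough sketch, X/8 for i32 becomes:
//   srawi rT, rX, 3
//   addze rQ, rT
// with an additional negation when the divisor is negative.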
SDValue
PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  // fold (sdiv X, pow2)
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64 && !Subtarget.isPPC64())
    return SDValue();
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);

  bool IsNegPow2 = (-Divisor).isPowerOf2();
  unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros();
  SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);

  SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
  Created.push_back(Op.getNode());

  if (IsNegPow2) {
    Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
    Created.push_back(Op.getNode());
  }

  return Op;
}

//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//

void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case PPCISD::LBRX: {
    // lhbrx is known to have the top bits cleared out.
    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
      Known.Zero = 0xFFFF0000;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
    default: break;
    case Intrinsic::ppc_altivec_vcmpbfp_p:
    case Intrinsic::ppc_altivec_vcmpeqfp_p:
    case Intrinsic::ppc_altivec_vcmpequb_p:
    case Intrinsic::ppc_altivec_vcmpequh_p:
    case Intrinsic::ppc_altivec_vcmpequw_p:
    case Intrinsic::ppc_altivec_vcmpequd_p:
    case Intrinsic::ppc_altivec_vcmpgefp_p:
    case Intrinsic::ppc_altivec_vcmpgtfp_p:
    case Intrinsic::ppc_altivec_vcmpgtsb_p:
    case Intrinsic::ppc_altivec_vcmpgtsh_p:
    case Intrinsic::ppc_altivec_vcmpgtsw_p:
    case Intrinsic::ppc_altivec_vcmpgtsd_p:
    case Intrinsic::ppc_altivec_vcmpgtub_p:
    case Intrinsic::ppc_altivec_vcmpgtuh_p:
    case Intrinsic::ppc_altivec_vcmpgtuw_p:
    case Intrinsic::ppc_altivec_vcmpgtud_p:
      Known.Zero = ~1U;  // All bits but the low one are known to be zero.
      break;
    }
  }
  }
}

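// Return the preferred alignment for loop headers: on the cores listed
// below, prefer a 32-byte boundary for innermost nested loops and for loops
// small enough (17-32 bytes) to fit in one instruction-cache line.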
Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  switch (Subtarget.getCPUDirective()) {
  default: break;
  case PPC::DIR_970:
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
  case PPC::DIR_PWR_FUTURE: {
    if (!ML)
      break;

    if (!DisableInnermostLoopAlign32) {
      // If the nested loop is an innermost loop, prefer a 32-byte alignment,
      // so that we can decrease cache misses and branch-prediction misses.
      // The actual alignment of the loop will depend on the hotness check and
      // other logic in alignBlocks.
      if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
        return Align(32);
    }

    const PPCInstrInfo *TII = Subtarget.getInstrInfo();

    // For small loops (between 5 and 8 instructions), align to a 32-byte
    // boundary so that the entire loop fits in one instruction-cache line.
    uint64_t LoopSize = 0;
    for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
      for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) {
        LoopSize += TII->getInstSizeInBytes(*J);
        if (LoopSize > 32)
          break;
      }

    if (LoopSize > 16 && LoopSize <= 32)
      return Align(32);

    break;
  }
  }

  return TargetLowering::getPrefLoopAlignment(ML);
}

/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'b':
    case 'r':
    case 'f':
    case 'd':
    case 'v':
    case 'y':
      return C_RegisterClass;
    case 'Z':
      // FIXME: While Z does indicate a memory constraint, it specifically
      // indicates an r+r address (used in conjunction with the 'y' modifier
      // in the replacement string). Currently, we're forcing the base
      // register to be r0 in the asm printer (which is interpreted as zero)
      // and forming the complete address in the second register. This is
      // suboptimal.
      return C_Memory;
    }
  } else if (Constraint == "wc") { // individual CR bits.
    return C_RegisterClass;
  } else if (Constraint == "wa" || Constraint == "wd" ||
             Constraint == "wf" || Constraint == "ws" ||
             Constraint == "wi" || Constraint == "ww") {
    return C_RegisterClass; // VSX registers.
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
PPCTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();

  // Look at the constraint type.
  if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
    return CW_Register; // an individual CR bit.
  else if ((StringRef(constraint) == "wa" ||
            StringRef(constraint) == "wd" ||
            StringRef(constraint) == "wf") &&
           type->isVectorTy())
    return CW_Register;
  else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
    return CW_Register; // just holds 64-bit integer data.
  else if (StringRef(constraint) == "ws" && type->isDoubleTy())
    return CW_Register;
  else if (StringRef(constraint) == "ww" && type->isFloatTy())
    return CW_Register;

  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'b':
    if (type->isIntegerTy())
      weight = CW_Register;
    break;
  case 'f':
    if (type->isFloatTy())
      weight = CW_Register;
    break;
  case 'd':
    if (type->isDoubleTy())
      weight = CW_Register;
    break;
  case 'v':
    if (type->isVectorTy())
      weight = CW_Register;
    break;
  case 'y':
    weight = CW_Register;
    break;
  case 'Z':
    weight = CW_Memory;
    break;
  }
  return weight;
}

std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  if (Constraint.size() == 1) {
    // GCC RS6000 Constraint Letters
    switch (Constraint[0]) {
    case 'b':   // R1-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
    case 'r':   // R0-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RCRegClass);
      return std::make_pair(0U, &PPC::GPRCRegClass);
    // 'd' and 'f' constraints are both defined to be "the floating point
    // registers", where one is for 32-bit and the other for 64-bit. We don't
    // really care overly much here so just give them all the same reg classes.
    case 'd':
    case 'f':
      if (Subtarget.hasSPE()) {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::GPRCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::SPERCRegClass);
      } else {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::F4RCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::F8RCRegClass);
        if (VT == MVT::v4f64 && Subtarget.hasQPX())
          return std::make_pair(0U, &PPC::QFRCRegClass);
        if (VT == MVT::v4f32 && Subtarget.hasQPX())
          return std::make_pair(0U, &PPC::QSRCRegClass);
      }
      break;
    case 'v':
      if (VT == MVT::v4f64 && Subtarget.hasQPX())
        return std::make_pair(0U, &PPC::QFRCRegClass);
      if (VT == MVT::v4f32 && Subtarget.hasQPX())
        return std::make_pair(0U, &PPC::QSRCRegClass);
      if (Subtarget.hasAltivec())
        return std::make_pair(0U, &PPC::VRRCRegClass);
      break;
    case 'y':   // crrc
      return std::make_pair(0U, &PPC::CRRCRegClass);
    }
  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
    // An individual CR bit.
    return std::make_pair(0U, &PPC::CRBITRCRegClass);
  } else if ((Constraint == "wa" || Constraint == "wd" ||
              Constraint == "wf" || Constraint == "wi") &&
             Subtarget.hasVSX()) {
    return std::make_pair(0U, &PPC::VSRCRegClass);
  } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(0U, &PPC::VSSRCRegClass);
    else
      return std::make_pair(0U, &PPC::VSFRCRegClass);
  }

  // If we name a VSX register, we can't defer to the base class because it
  // will not recognize the correct register (their names will be VSL{0-31}
  // and V{0-31} so they won't match). So we match them here.
  if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
    int VSNum = atoi(Constraint.data() + 3);
    assert(VSNum >= 0 && VSNum <= 63 &&
           "Attempted to access a vsr out of range");
    if (VSNum < 32)
      return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
    return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
  }
  std::pair<unsigned, const TargetRegisterClass *> R =
      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
  // register.
  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
      PPC::GPRCRegClass.contains(R.first))
    return std::make_pair(TRI->getMatchingSuperReg(R.first,
                          PPC::sub_32, &PPC::G8RCRegClass),
                          &PPC::G8RCRegClass);

  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
  if (!R.second && StringRef("{cc}").equals_lower(Constraint)) {
    R.first = PPC::CR0;
    R.second = &PPC::CRRCRegClass;
  }

  return R;
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
|
|
/// vector. If it is invalid, don't add anything to Ops.
|
|
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
|
|
std::string &Constraint,
|
|
std::vector<SDValue>&Ops,
|
|
SelectionDAG &DAG) const {
|
|
SDValue Result;
|
|
|
|
// Only support length 1 constraints.
|
|
if (Constraint.length() > 1) return;
|
|
|
|
char Letter = Constraint[0];
|
|
switch (Letter) {
|
|
default: break;
|
|
case 'I':
|
|
case 'J':
|
|
case 'K':
|
|
case 'L':
|
|
case 'M':
|
|
case 'N':
|
|
case 'O':
|
|
case 'P': {
|
|
ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
|
|
if (!CST) return; // Must be an immediate to match.
|
|
SDLoc dl(Op);
|
|
int64_t Value = CST->getSExtValue();
|
|
EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
|
|
// numbers are printed as such.
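    // For example, constraint 'I' accepts 42 (it fits in a signed 16-bit
    // field) and produces a target constant below, whereas 100000 is out of
    // range, so nothing is added to Ops and the constraint fails to match.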
    switch (Letter) {
    default: llvm_unreachable("Unknown constraint letter!");
    case 'I':  // "I" is a signed 16-bit constant.
      if (isInt<16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
      if (isShiftedUInt<16, 16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
      if (isShiftedInt<16, 16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
      if (isUInt<16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'M':  // "M" is a constant that is greater than 31.
      if (Value > 31)
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'N':  // "N" is a positive constant that is an exact power of two.
      if (Value > 0 && isPowerOf2_64(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'O':  // "O" is the constant zero.
      if (Value == 0)
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
      if (isInt<16>(-Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    }
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  // Handle standard constraint letters.
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS, Instruction *I) const {
  // PPC does not allow r+i addressing modes for vectors!
  if (Ty->isVectorTy() && AM.BaseOffs != 0)
    return false;

  // PPC allows a sign-extended 16-bit immediate field.
  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
    return false;

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // PPC only supports r+r:
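  // For example: r+r (Scale == 1, no offset) and r+imm16 (Scale == 0) are
  // accepted below, and 2*r is folded as r+r, but r+r+imm and 2*r+imm are
  // rejected.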
  switch (AM.Scale) {
  case 0:  // "r+i" or just "i", depending on HasBaseReg.
    break;
  case 1:
    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
      return false;
    // Otherwise we have r+r or r+i.
    break;
  case 2:
    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r or 2*r+i is not allowed.
      return false;
    // Allow 2*r as r+r.
    break;
  default:
    // No other scales are supported.
    return false;
  }

  return true;
}

SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // Make sure the function does not optimize away the store of the RA to
  // the stack.
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setLRStoreRequired();
  bool isPPC64 = Subtarget.isPPC64();
  auto PtrVT = getPointerTy(MF.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
        DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
                        isPPC64 ? MVT::i64 : MVT::i32);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address off the stack.
  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;

  // Naked functions never have a frame pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned FrameReg;
  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
  else
    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;

  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                         PtrVT);
  while (Depth--)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
  bool isPPC64 = Subtarget.isPPC64();
  bool IsDarwinABI = Subtarget.isDarwinABI();

  bool is64Bit = isPPC64 && VT == LLT::scalar(64);
  if (!is64Bit && VT != LLT::scalar(32))
    report_fatal_error("Invalid register global variable type");

  Register Reg = StringSwitch<Register>(RegName)
                     .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
                     .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2)
                     .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() :
                                    (is64Bit ? PPC::X13 : PPC::R13))
                     .Default(Register());

  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}

bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
  // The 32-bit SVR4 ABI accesses everything as got-indirect.
  if (Subtarget.is32BitELFABI())
    return true;

  // AIX accesses everything indirectly through the TOC, which is similar to
  // the GOT.
  if (Subtarget.isAIXABI())
    return true;

  CodeModel::Model CModel = getTargetMachine().getCodeModel();
  // If it is small or large code model, module locals are accessed
  // indirectly by loading their address from .toc/.got.
  if (CModel == CodeModel::Small || CModel == CodeModel::Large)
    return true;

  // JumpTable and BlockAddress are accessed as got-indirect.
  if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
    return true;

  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA))
    return Subtarget.isGVIndirectSymbol(G->getGlobal());

  return false;
}

bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The PowerPC target isn't yet aware of offsets.
  return false;
}

bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::ppc_qpx_qvlfd:
  case Intrinsic::ppc_qpx_qvlfs:
  case Intrinsic::ppc_qpx_qvlfcd:
  case Intrinsic::ppc_qpx_qvlfcs:
  case Intrinsic::ppc_qpx_qvlfiwa:
  case Intrinsic::ppc_qpx_qvlfiwz:
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_lvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_lvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_lvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_lxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvlfd:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvlfs:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvlfcd:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvlfcs:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(0);
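    // These load intrinsics ignore the low-order address bits (e.g. lvx
    // aligns the address down to 16 bytes), so conservatively describe the
    // access as the 2*size-1 byte window that must contain the bytes touched.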
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = Align::None();
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::ppc_qpx_qvlfda:
  case Intrinsic::ppc_qpx_qvlfsa:
  case Intrinsic::ppc_qpx_qvlfcda:
  case Intrinsic::ppc_qpx_qvlfcsa:
  case Intrinsic::ppc_qpx_qvlfiwaa:
  case Intrinsic::ppc_qpx_qvlfiwza: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_qpx_qvlfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvlfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvlfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvlfcsa:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.size = VT.getStoreSize();
    Info.align = Align::None();
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::ppc_qpx_qvstfd:
  case Intrinsic::ppc_qpx_qvstfs:
  case Intrinsic::ppc_qpx_qvstfcd:
  case Intrinsic::ppc_qpx_qvstfcs:
  case Intrinsic::ppc_qpx_qvstfiw:
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfd:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvstfs:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvstfcd:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfcs:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(1);
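    // Same conservative window as the loads above: these store intrinsics
    // also ignore the low-order address bits.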
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = Align::None();
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::ppc_qpx_qvstfda:
  case Intrinsic::ppc_qpx_qvstfsa:
  case Intrinsic::ppc_qpx_qvstfcda:
  case Intrinsic::ppc_qpx_qvstfcsa:
  case Intrinsic::ppc_qpx_qvstfiwa: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_qpx_qvstfda:
      VT = MVT::v4f64;
      break;
    case Intrinsic::ppc_qpx_qvstfsa:
      VT = MVT::v4f32;
      break;
    case Intrinsic::ppc_qpx_qvstfcda:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_qpx_qvstfcsa:
      VT = MVT::v2f32;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.size = VT.getStoreSize();
    Info.align = Align::None();
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  default:
    break;
  }

  return false;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
/// not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
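/// For example, a 32-byte memcpy between 16-byte-aligned buffers on a VSX
/// target is expanded with v4i32 below, i.e. two vector loads and two vector
/// stores.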
EVT PPCTargetLowering::getOptimalMemOpType(
    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
    bool ZeroMemset, bool MemcpyStrSrc,
    const AttributeList &FuncAttributes) const {
  if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
    // When expanding a memset, require at least two QPX instructions to cover
    // the cost of loading the value to be stored from the constant pool.
    if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
        (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
        !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
      return MVT::v4f64;
    }

    // We should use Altivec/VSX loads and stores when available. For unaligned
    // addresses, unaligned VSX loads are only fast starting with the P8.
    if (Subtarget.hasAltivec() && Size >= 16 &&
        (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
         ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
      return MVT::v4i32;
  }

  if (Subtarget.isPPC64()) {
    return MVT::i64;
  }

  return MVT::i32;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  return !(BitSize == 0 || BitSize > 64);
}

bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  // Generally speaking, zexts are not free, but they are free when they can be
  // folded with other operations.
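  // For example, lbz/lhz/lwz already zero-extend into the full 64-bit
  // register, so zero-extending the result of such a load costs nothing.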
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
    EVT MemVT = LD->getMemoryVT();
    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
         LD->getExtensionType() == ISD::ZEXTLOAD))
      return true;
  }

  // FIXME: Add other cases...
  //  - 32-bit shifts with a zext to i64
  //  - zext after ctlz, bswap, etc.
  //  - zext after and by a constant mask

  return TargetLowering::isZExtFree(Val, VT2);
}

bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
  assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
         "invalid fpext types");
  // Extending to float128 is not free.
  if (DestVT == MVT::f128)
    return false;
  return true;
}

bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}

bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}

bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                       unsigned,
                                                       unsigned,
                                                       MachineMemOperand::Flags,
                                                       bool *Fast) const {
  if (DisablePPCUnaligned)
    return false;

  // PowerPC supports unaligned memory access for simple non-vector types.
  // Although accessing unaligned addresses is not as efficient as accessing
  // aligned addresses, it is generally more efficient than manual expansion,
  // and generally only traps for software emulation when crossing page
  // boundaries.

  if (!VT.isSimple())
    return false;

  if (VT.isFloatingPoint() && !Subtarget.allowsUnalignedFPAccess())
    return false;

  if (VT.getSimpleVT().isVector()) {
    if (Subtarget.hasVSX()) {
      if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
          VT != MVT::v4f32 && VT != MVT::v4i32)
        return false;
    } else {
      return false;
    }
  }

  if (VT == MVT::ppcf128)
    return false;

  if (Fast)
    *Fast = true;

  return true;
}

bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  case MVT::f128:
    return (EnableQuadPrecision && Subtarget.hasP9Vector());
  default:
    break;
  }

  return false;
}

const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
  // to CTR, which is used by any indirect call.
  static const MCPhysReg ScratchRegs[] = {
    PPC::X12, PPC::LR8, PPC::CTR8, 0
  };

  return ScratchRegs;
}

unsigned PPCTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}

unsigned PPCTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}

bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
  EVT VT , unsigned DefinedValues) const {
  if (VT == MVT::v2i64)
    return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves

  if (Subtarget.hasVSX() || Subtarget.hasQPX())
    return true;

  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}

Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
  if (DisableILPPref || Subtarget.enableMachineScheduler())
    return TargetLowering::getSchedulingPreference(N);

  return Sched::ILP;
}

// Create a fast isel object.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
                                  const TargetLibraryInfo *LibInfo) const {
  return PPC::createFastISel(FuncInfo, LibInfo);
}

void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (Subtarget.isDarwinABI()) return;
  if (!Subtarget.isPPC64()) return;

  // Update IsSplitCSR in PPCFunctionInfo
  PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
  PFI->setIsSplitCSR(true);
}

void PPCTargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (PPC::G8RCRegClass.contains(*I))
      RC = &PPC::G8RCRegClass;
    else if (PPC::F8RCRegClass.contains(*I))
      RC = &PPC::F8RCRegClass;
    else if (PPC::CRRCRegClass.contains(*I))
      RC = &PPC::CRRCRegClass;
    else if (PPC::VRRCRegClass.contains(*I))
      RC = &PPC::VRRCRegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
             Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

// Override to enable LOAD_STACK_GUARD lowering on Linux.
bool PPCTargetLowering::useLoadStackGuardNode() const {
  if (!Subtarget.isTargetLinux())
    return TargetLowering::useLoadStackGuardNode();
  return true;
}

// Override to disable global variable loading on Linux.
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
  if (!Subtarget.isTargetLinux())
    return TargetLowering::insertSSPDeclarations(M);
}

bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!VT.isSimple() || !Subtarget.hasVSX())
    return false;

  switch(VT.getSimpleVT().SimpleTy) {
  default:
    // For FP types that are currently not supported by PPC backend, return
    // false. Examples: f16, f80.
    return false;
  case MVT::f32:
  case MVT::f64:
  case MVT::ppcf128:
    return Imm.isPosZero();
  }
}

// For vector shift operation op, fold
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
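// For example, (shl v4i32:x, (and y, <31,31,31,31>)) becomes
// (PPCISD::SHL x, y): vslw only uses the low 5 bits of each shift amount,
// so the masking is redundant.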
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();
  unsigned Opcode = N->getOpcode();
  unsigned TargetOpcode;

  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected shift operation");
  case ISD::SHL:
    TargetOpcode = PPCISD::SHL;
    break;
  case ISD::SRL:
    TargetOpcode = PPCISD::SRL;
    break;
  case ISD::SRA:
    TargetOpcode = PPCISD::SRA;
    break;
  }

  if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
      N1->getOpcode() == ISD::AND)
    if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
      if (Mask->getZExtValue() == OpSizeInBits - 1)
        return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));

  return SDValue();
}

SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  SDValue N0 = N->getOperand(0);
  ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!Subtarget.isISA3_0() ||
      N0.getOpcode() != ISD::SIGN_EXTEND ||
      N0.getOperand(0).getValueType() != MVT::i32 ||
      CN1 == nullptr || N->getValueType(0) != MVT::i64)
    return SDValue();

  // We can't save an operation here if the value is already extended, and
  // the existing shift is easier to combine.
  SDValue ExtsSrc = N0.getOperand(0);
  if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
      ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext)
    return SDValue();

  SDLoc DL(N0);
  SDValue ShiftBy = SDValue(CN1, 0);
  // We want the shift amount to be i32 on the extswli, but the shift could
  // have an i64.
  if (ShiftBy.getValueType() == MVT::i64)
    ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32);

  return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0),
                         ShiftBy);
}

SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  return SDValue();
}

SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  return SDValue();
}

// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
// Transform (add X, (zext(sete  Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
// When C is zero, the equation (addi Z, -C) can be simplified to Z
// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types
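// For example, X + (zext (setne Z, 7)): addi computes t = Z - 7, addic t, -1
// produces a carry exactly when t is nonzero, and addze adds that carry to X,
// yielding X + (Z != 7).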
static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
                                 const PPCSubtarget &Subtarget) {
  if (!Subtarget.isPPC64())
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  auto isZextOfCompareWithConstant = [](SDValue Op) {
    if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
        Op.getValueType() != MVT::i64)
      return false;

    SDValue Cmp = Op.getOperand(0);
    if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
        Cmp.getOperand(0).getValueType() != MVT::i64)
      return false;

    if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) {
      int64_t NegConstant = 0 - Constant->getSExtValue();
      // Due to the limitations of the addi instruction,
      // -C is required to be [-32768, 32767].
      return isInt<16>(NegConstant);
    }

    return false;
  };

  bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
  bool RHSHasPattern = isZextOfCompareWithConstant(RHS);

  // If there is a pattern, canonicalize a zext operand to the RHS.
  if (LHSHasPattern && !RHSHasPattern)
    std::swap(LHS, RHS);
  else if (!LHSHasPattern && !RHSHasPattern)
    return SDValue();

  SDLoc DL(N);
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
  SDValue Cmp = RHS.getOperand(0);
  SDValue Z = Cmp.getOperand(0);
  auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));

  assert(Constant && "Constant Should not be a null pointer.");
  int64_t NegConstant = 0 - Constant->getSExtValue();

  switch(cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {
  default: break;
  case ISD::SETNE: {
    //                                 when C == 0
    //                             --> addze X, (addic Z, -1).carry
    //                            /
    // add X, (zext(setne Z, C))--
    //                            \    when -32768 <= -C <= 32767 && C != 0
    //                             --> addze X, (addic (addi Z, -C), -1).carry
    SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
                              DAG.getConstant(NegConstant, DL, MVT::i64));
    SDValue AddOrZ = NegConstant != 0 ? Add : Z;
    SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
                               AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));
    return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
                       SDValue(Addc.getNode(), 1));
  }
  case ISD::SETEQ: {
    //                                 when C == 0
    //                             --> addze X, (subfic Z, 0).carry
    //                            /
    // add X, (zext(sete  Z, C))--
    //                            \    when -32768 <= -C <= 32767 && C != 0
    //                             --> addze X, (subfic (addi Z, -C), 0).carry
    SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
                              DAG.getConstant(NegConstant, DL, MVT::i64));
    SDValue AddOrZ = NegConstant != 0 ? Add : Z;
    SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
                               DAG.getConstant(0, DL, MVT::i64), AddOrZ);
    return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
                       SDValue(Subc.getNode(), 1));
  }
  }

  return SDValue();
}

SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
    return Value;

  return SDValue();
}

// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128 bit float.
// This can be of two forms:
//  1) BITCAST of f128 feeding TRUNCATE
//  2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// The reason this is required is because we do not have a legal i128 type
// and so we want to prevent having to store the f128 and then reload part
// of it.
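// For example, on little endian, (trunc (bitcast f128:X to i128) to i64)
// becomes (extract_vector_elt (bitcast X to v2i64), 0); with an intervening
// (srl ..., 64) the extracted element index flips to 1.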
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  // If we are using CRBits then try that first.
  if (Subtarget.useCRBits()) {
    // Check if CRBits did anything and return that if it did.
    if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
      return CRTruncValue;
  }

  SDLoc dl(N);
  SDValue Op0 = N->getOperand(0);

  // Looking for a truncate of i128 to i64.
  if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
    return SDValue();

  int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;

  // SRL feeding TRUNCATE.
  if (Op0.getOpcode() == ISD::SRL) {
    ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    // The right shift has to be by 64 bits.
    if (!ConstNode || ConstNode->getZExtValue() != 64)
      return SDValue();

    // Switch the element number to extract.
    EltToExtract = EltToExtract ? 0 : 1;
    // Update Op0 past the SRL.
    Op0 = Op0.getOperand(0);
  }

  // BITCAST feeding a TRUNCATE possibly via SRL.
  if (Op0.getOpcode() == ISD::BITCAST &&
      Op0.getValueType() == MVT::i128 &&
      Op0.getOperand(0).getValueType() == MVT::f128) {
    SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
    return DCI.DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
        DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
  }
  return SDValue();
}

SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
  if (!ConstOpOrElement)
    return SDValue();

  // An imul is usually smaller than the alternative sequence for a legal type.
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      isOperationLegal(ISD::MUL, N->getValueType(0)))
    return SDValue();

  auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
    switch (this->Subtarget.getCPUDirective()) {
    default:
      // TODO: enhance the condition for subtarget before pwr8
      return false;
    case PPC::DIR_PWR8:
      //  type        mul     add    shl
      // scalar        4       1      1
      // vector        7       2      2
      return true;
    case PPC::DIR_PWR9:
    case PPC::DIR_PWR_FUTURE:
      //  type        mul     add    shl
      // scalar        5       2      2
      // vector        7       2      2

      // The cycle counts of the related operations are shown in the table
      // above. Because mul costs 5 (scalar) or 7 (vector) cycles while
      // add/sub/shl all cost 2 for both scalar and vector types, the
      // two-instruction patterns (add/sub + shl, 4 cycles in total) are
      // always profitable; but the three-instruction pattern
      // (mul x, -(2^N + 1)) => -(add (shl x, N), x), i.e. sub + add + shl,
      // costs 6 cycles, so we should only use it for vector types.
      return IsAddOne && IsNeg ? VT.isVector() : true;
    }
  };

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
  bool IsNeg = MulAmt.isNegative();
  APInt MulAmtAbs = MulAmt.abs();

  if ((MulAmtAbs - 1).isPowerOf2()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
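    // For example, (mul x, 5): 5 - 1 = 4 = 2^2, so emit (add (shl x, 2), x).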

    if (!IsProfitable(IsNeg, true, VT))
      return SDValue();

    SDValue Op0 = N->getOperand(0);
    SDValue Op1 =
        DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                    DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
    SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);

    if (!IsNeg)
      return Res;

    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
  } else if ((MulAmtAbs + 1).isPowerOf2()) {
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
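    // For example, (mul x, 7): 7 + 1 = 8 = 2^3, so emit (sub (shl x, 3), x).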

    if (!IsProfitable(IsNeg, false, VT))
      return SDValue();

    SDValue Op0 = N->getOperand(0);
    SDValue Op1 =
        DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                    DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));

    if (!IsNeg)
      return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
    else
      return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);

  } else {
    return SDValue();
  }
}

bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  // Only duplicate to increase tail-calls for the 64-bit SysV ABIs.
  if (!Subtarget.is64BitELFABI())
    return false;

  // If not a tail call then no need to proceed.
  if (!CI->isTailCall())
    return false;

  // If sibling calls have been disabled and tail-calls aren't guaranteed,
  // there is no reason to duplicate.
  auto &TM = getTargetMachine();
  if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
    return false;

  // Can't tail call a function called indirectly, or if it has variadic args.
  const Function *Callee = CI->getCalledFunction();
  if (!Callee || Callee->isVarArg())
    return false;

  // Make sure the callee and caller calling conventions are eligible for tco.
  const Function *Caller = CI->getParent()->getParent();
  if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
                                           CI->getCallingConv()))
      return false;

  // If the function is local then we have a good chance at tail-calling it
  return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
}

bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
  if (!Subtarget.hasVSX())
    return false;
  if (Subtarget.hasP9Vector() && VT == MVT::f128)
    return true;
  return VT == MVT::f32 || VT == MVT::f64 ||
    VT == MVT::v4f32 || VT == MVT::v2f64;
}

bool PPCTargetLowering::
isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
  const Value *Mask = AndI.getOperand(1);
  // If the mask is suitable for andi. or andis. we should sink the and.
  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
    // Can't handle constants wider than 64-bits.
    if (CI->getBitWidth() > 64)
      return false;
    int64_t ConstVal = CI->getZExtValue();
    return isUInt<16>(ConstVal) ||
      (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
  }

  // For non-constant masks, we can always use the record-form and.
  return true;
}

// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0)
// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0)
// Transform (abs (sub a, b)) to (vabsd a b 1) if a & b are of type v4i32
SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
  assert(Subtarget.hasP9Altivec() &&
         "Only combine this when P9 altivec supported!");
  EVT VT = N->getValueType(0);
  if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  if (N->getOperand(0).getOpcode() == ISD::SUB) {
    // Even for signed integers, if it's known to be positive (as signed
    // integer) due to zero-extended inputs.
    unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
    unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
    if ((SubOpcd0 == ISD::ZERO_EXTEND ||
         SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) &&
        (SubOpcd1 == ISD::ZERO_EXTEND ||
         SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) {
      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
                         N->getOperand(0)->getOperand(0),
                         N->getOperand(0)->getOperand(1),
                         DAG.getTargetConstant(0, dl, MVT::i32));
    }

    // For type v4i32, it can be optimized with xvnegsp + vabsduw
    if (N->getOperand(0).getValueType() == MVT::v4i32 &&
        N->getOperand(0).hasOneUse()) {
      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
                         N->getOperand(0)->getOperand(0),
                         N->getOperand(0)->getOperand(1),
                         DAG.getTargetConstant(1, dl, MVT::i32));
    }
  }

  return SDValue();
}

// For type v4i32/v8i16/v16i8, transform
// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b)
// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b)
// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b)
SDValue PPCTargetLowering::combineVSelect(SDNode *N,
                                          DAGCombinerInfo &DCI) const {
  assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here");
  assert(Subtarget.hasP9Altivec() &&
         "Only combine this when P9 altivec supported!");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  SDValue Cond = N->getOperand(0);
  SDValue TrueOpnd = N->getOperand(1);
  SDValue FalseOpnd = N->getOperand(2);
  EVT VT = N->getOperand(1).getValueType();

  if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB ||
      FalseOpnd.getOpcode() != ISD::SUB)
    return SDValue();

  // ABSD only available for type v4i32/v8i16/v16i8
  if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
    return SDValue();

  // At least to save one more dependent computation
  if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse()))
    return SDValue();

  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

  // Can only handle unsigned comparison here
  switch (CC) {
  default:
    return SDValue();
  case ISD::SETUGT:
  case ISD::SETUGE:
    break;
  case ISD::SETULT:
  case ISD::SETULE:
    std::swap(TrueOpnd, FalseOpnd);
    break;
  }

  SDValue CmpOpnd1 = Cond.getOperand(0);
  SDValue CmpOpnd2 = Cond.getOperand(1);

  // SETCC CmpOpnd1 CmpOpnd2 cond
  // TrueOpnd = CmpOpnd1 - CmpOpnd2
  // FalseOpnd = CmpOpnd2 - CmpOpnd1
  if (TrueOpnd.getOperand(0) == CmpOpnd1 &&
      TrueOpnd.getOperand(1) == CmpOpnd2 &&
      FalseOpnd.getOperand(0) == CmpOpnd2 &&
      FalseOpnd.getOperand(1) == CmpOpnd1) {
    return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(),
                       CmpOpnd1, CmpOpnd2,
                       DAG.getTargetConstant(0, dl, MVT::i32));
  }

  return SDValue();
}