From a9cc92c247ce5d0ecc3399e7af6e40a3d59bbf6c Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Fri, 30 Nov 2018 22:55:29 +0000 Subject: [PATCH] AMDGPU: Fix various issues around the VirtReg2Value mapping Summary: The VirtReg2Value mapping is crucial for getting consistently reliable divergence information into the SelectionDAG. This patch fixes a bunch of issues that lead to incorrect divergence info and introduces tight assertions to ensure we don't regress: 1. VirtReg2Value is generated lazily; there were some cases where a lookup was performed before all relevant virtual registers were created, leading to an out-of-sync mapping. Those cases were: - Complex code to lower formal arguments that generated CopyFromReg nodes from live-in registers (fixed by never querying the mapping for live-in registers). - Code that generates CopyToReg for formal arguments that are used outside the entry basic block (fixed by never querying the mapping for Register nodes, which don't need the divergence info anyway). 2. For complex values that are lowered to a sequence of registers, all registers must be reflected in the VirtReg2Value mapping. I am not adding any new tests, since I'm not actually aware of any bugs that these problems are causing with trunk as-is. However, I recently added a test case (in r346423) which fails when D53283 is applied without this change. Also, the new assertions should provide most of the effective test coverage. There is one test change in sdwa-peephole.ll. The underlying issue is that since the divergence info is now correct, the DAGISel will select V_OR_B32 directly instead of S_OR_B32. This leads to an extra COPY which affects the behavior of MachineLICM in a way that ends up with the S_MOV_B32 with the constant in a different basic block than the V_OR_B32, which is presumably what defeats the peephole. Reviewers: alex-t, arsenm, rampitec Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D54340 llvm-svn: 348049 --- .../llvm/CodeGen/FunctionLoweringInfo.h | 1 + .../SelectionDAG/FunctionLoweringInfo.cpp | 13 +++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 63 ++++++++++--------- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 7 ++- 4 files changed, 53 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h index 5fe4f89f34b..7c658515de0 100644 --- a/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/llvm/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -246,6 +246,7 @@ public: return 0; unsigned &R = ValueMap[V]; assert(R == 0 && "Already initialized this value register!"); + assert(VirtReg2Value.empty()); return R = CreateRegs(V->getType()); } diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index d3c31911d67..fba728625b0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -579,9 +579,18 @@ FunctionLoweringInfo::getOrCreateSwiftErrorVRegUseAt(const Instruction *I, const const Value * FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) { if (VirtReg2Value.empty()) { + SmallVector ValueVTs; for (auto &P : ValueMap) { - VirtReg2Value[P.second] = P.first; + ValueVTs.clear(); + ComputeValueVTs(*TLI, Fn->getParent()->getDataLayout(), + P.first->getType(), ValueVTs); + unsigned Reg = P.second; + for (EVT VT : ValueVTs) { + unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT); + for (unsigned i = 0, e = NumRegisters; i != e; ++i) + VirtReg2Value[Reg++] = P.first; + } } } - return VirtReg2Value[Vreg]; + return VirtReg2Value.lookup(Vreg); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 9f5198042e4..dbcd1bf0c76 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9303,42 +9303,49 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); } +LLVM_ATTRIBUTE_UNUSED +static bool isCopyFromRegOfInlineAsm(const SDNode *N) { + assert(N->getOpcode() == ISD::CopyFromReg); + do { + // Follow the chain until we find an INLINEASM node. + N = N->getOperand(0).getNode(); + if (N->getOpcode() == ISD::INLINEASM) + return true; + } while (N->getOpcode() == ISD::CopyFromReg); + return false; +} + bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const { switch (N->getOpcode()) { - case ISD::Register: case ISD::CopyFromReg: { - const RegisterSDNode *R = nullptr; - if (N->getOpcode() == ISD::Register) { - R = dyn_cast(N); - } - else { - R = dyn_cast(N->getOperand(1)); - } - if (R) - { - const MachineFunction * MF = FLI->MF; - const GCNSubtarget &ST = MF->getSubtarget(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); - unsigned Reg = R->getReg(); - if (TRI.isPhysicalRegister(Reg)) - return TRI.isVGPR(MRI, Reg); + const RegisterSDNode *R = cast(N->getOperand(1)); + const MachineFunction * MF = FLI->MF; + const GCNSubtarget &ST = MF->getSubtarget(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return !TRI.isSGPRReg(MRI, Reg); - if (MRI.isLiveIn(Reg)) { - // workitem.id.x workitem.id.y workitem.id.z - // Any VGPR formal argument is also considered divergent - if (TRI.isVGPR(MRI, Reg)) - return true; - // Formal arguments of non-entry functions - // are conservatively considered divergent - else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) - return true; - } - return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + // Any VGPR formal argument is also considered divergent + if (!TRI.isSGPRReg(MRI, Reg)) + return true; + // Formal arguments of non-entry functions + // are conservatively considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; + return false; } + const Value *V = FLI->getValueFromVirtualReg(Reg); + if (V) + return KDA->isDivergent(V); + assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); + return !TRI.isSGPRReg(MRI, Reg); } break; case ISD::LOAD: { diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 3c92e8e5cba..6beac4904a7 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -501,7 +501,12 @@ entry: ; GCN-LABEL: {{^}}sdwa_crash_inlineasm_def: ; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff ; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x10000, +; +; TODO: Why is the constant not peepholed into the v_or_b32_e32? +; +; NOSDWA: s_mov_b32 [[CONST:s[0-9]+]], 0x10000 +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, s0, +; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { bb: br label %bb1