teak-llvm/llvm/test/CodeGen/X86/vector-extend-inreg.ll
Craig Topper 6af1be9664 [X86] Use vmovq for v4i64/v4f64/v8i64/v8f64 vzmovl.
We already use vmovq for v2i64/v2f64 vzmovl, but we were using
blendpd+xorpd for v4i64/v4f64/v8i64/v8f64 when optimizing for speed,
or movsd+xorpd under optsize.

I think the blend with 0 or movss/d is only needed for vXi32,
where we don't have an instruction that can move 32 bits from one
xmm register to another while zeroing the upper bits.

movq is no worse than blendpd on any known CPU.

llvm-svn: 364079
2019-06-21 17:24:21 +00:00
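
For illustration only (not part of this test file): the vzmovl pattern described
above is a shuffle of a vector with zero that keeps element 0 and zeroes the
remaining lanes. Below is a minimal sketch in LLVM IR with hypothetical function
names; per the commit message, the 64-bit-element case is expected to lower to a
single vmovq, while the 32-bit-element case still needs a blend with zero or a
movss-style move.

; Hypothetical vzmovl-style shuffle on <4 x double>: keep element 0, zero the rest.
; Per the commit message, this kind of node can now be lowered with vmovq, which
; moves the low 64 bits and zeroes the upper bits of the destination register.
define <4 x double> @example_vzmovl_v4f64(<4 x double> %a) {
  %r = shufflevector <4 x double> %a, <4 x double> zeroinitializer,
                     <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %r
}

; Hypothetical 32-bit-element case: there is no instruction that moves 32 bits
; between xmm registers while zeroing the upper bits, so a blend with zero
; (or movss/d) is still expected here.
define <4 x i32> @example_vzmovl_v4i32(<4 x i32> %a) {
  %r = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer,
                     <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %r
}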

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X64-AVX
define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) nounwind {
; X32-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
; X32-SSE-NEXT: andl $-128, %esp
; X32-SSE-NEXT: subl $384, %esp # imm = 0x180
; X32-SSE-NEXT: movl 88(%ebp), %ecx
; X32-SSE-NEXT: movdqa 72(%ebp), %xmm0
; X32-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: xorps %xmm1, %xmm1
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: movaps %xmm1, (%esp)
; X32-SSE-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
; X32-SSE-NEXT: leal (%ecx,%ecx), %eax
; X32-SSE-NEXT: andl $31, %eax
; X32-SSE-NEXT: movl 128(%esp,%eax,4), %eax
; X32-SSE-NEXT: leal 1(%ecx,%ecx), %ecx
; X32-SSE-NEXT: andl $31, %ecx
; X32-SSE-NEXT: movl (%esp,%ecx,4), %edx
; X32-SSE-NEXT: movl %ebp, %esp
; X32-SSE-NEXT: popl %ebp
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pushq %rbp
; X64-SSE-NEXT: movq %rsp, %rbp
; X64-SSE-NEXT: andq $-128, %rsp
; X64-SSE-NEXT: subq $256, %rsp # imm = 0x100
; X64-SSE-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT: xorps %xmm0, %xmm0
; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps %xmm0, (%rsp)
; X64-SSE-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp)
; X64-SSE-NEXT: andl $15, %edi
; X64-SSE-NEXT: movq (%rsp,%rdi,8), %rax
; X64-SSE-NEXT: movq %rbp, %rsp
; X64-SSE-NEXT: popq %rbp
; X64-SSE-NEXT: retq
;
; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
; X32-AVX-NEXT: andl $-128, %esp
; X32-AVX-NEXT: subl $384, %esp # imm = 0x180
; X32-AVX-NEXT: movl 40(%ebp), %ecx
; X32-AVX-NEXT: vpbroadcastq 32(%ebp), %ymm0
; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, (%esp)
; X32-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: leal (%ecx,%ecx), %eax
; X32-AVX-NEXT: andl $31, %eax
; X32-AVX-NEXT: movl 128(%esp,%eax,4), %eax
; X32-AVX-NEXT: leal 1(%ecx,%ecx), %ecx
; X32-AVX-NEXT: andl $31, %ecx
; X32-AVX-NEXT: movl (%esp,%ecx,4), %edx
; X32-AVX-NEXT: movl %ebp, %esp
; X32-AVX-NEXT: popl %ebp
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
;
; X64-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbp
; X64-AVX-NEXT: movq %rsp, %rbp
; X64-AVX-NEXT: andq $-128, %rsp
; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100
; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi
; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps %ymm1, (%rsp)
; X64-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: andl $15, %edi
; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax
; X64-AVX-NEXT: movq %rbp, %rsp
; X64-AVX-NEXT: popq %rbp
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
%1 = extractelement <16 x i64> %a0, i32 15
%2 = insertelement <16 x i64> zeroinitializer, i64 %1, i32 4
%3 = extractelement <16 x i64> %2, i32 %a1
ret i64 %3
}