mirror of
https://github.com/Gericom/teak-llvm.git
synced 2025-06-19 19:45:40 -04:00

In r311255 we added a case where we split vectors whose elements are all derived from the same input vector so that we could shuffle it more efficiently. In doing so, createBuildVecShuffle was taught to adjust for the fact that all indices would be based off of the first vector when this happens, but it's possible for the code that checked that to fire incorrectly if we happen to have a BUILD_VECTOR of extracts from subvectors and don't hit this new optimization. Instead of trying to detect if we've split the vector by checking if we have extracts from the same base vector, we can just pass that information into createBuildVecShuffle, avoiding the miscompile. Differential Revision: https://reviews.llvm.org/D59507 llvm-svn: 356476
49 lines
2.2 KiB
LLVM
49 lines
2.2 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
|
|
|
|
; Regression test: interleave two <4 x half> vectors into one <8 x half>.
;
; @f loads a <4 x half> from %a and from %b, pairs up same-index elements with
; two-element shufflevectors, then rebuilds an <8 x half> one element at a time
; with extractelement/insertelement and stores it to %c. The stored vector is
;   <a0, b0, a1, b1, a2, b2, a3, b3>
; i.e. every result lane is an extract from a small sub-vector of the inputs.
; Per the accompanying commit message (D59507) this is the BUILD_VECTOR-of-
; extracts-from-subvectors shape that createBuildVecShuffle used to mishandle.
;
; The autogenerated assertions below expect fully scalar code: eight 16-bit
; zero-extending loads (offsets 0/2/4/6 from each source pointer) followed by
; eight 16-bit stores covering offsets 14 down to 0 of the destination.
; NOTE(review): %rdi/%rsi/%rdx presumably carry %a/%b/%c (SysV argument
; order for this triple) - the load/store offsets are consistent with that.
define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
|
|
; CHECK-LABEL: f:
|
|
; CHECK: # %bb.0:
|
|
; Expected loads: four halves through %rdi, then four halves through %rsi.
; CHECK-NEXT: movzwl (%rdi), %r8d
|
|
; CHECK-NEXT: movzwl 2(%rdi), %r9d
|
|
; CHECK-NEXT: movzwl 4(%rdi), %r11d
|
|
; CHECK-NEXT: movzwl 6(%rdi), %edi
|
|
; CHECK-NEXT: movzwl (%rsi), %r10d
|
|
; CHECK-NEXT: movzwl 2(%rsi), %ecx
|
|
; CHECK-NEXT: movzwl 4(%rsi), %eax
|
|
; CHECK-NEXT: movzwl 6(%rsi), %esi
|
|
; Expected stores: result lanes 7 down to 0, alternating between values loaded
; through %rsi (odd lanes) and through %rdi (even lanes).
; CHECK-NEXT: movw %si, 14(%rdx)
|
|
; CHECK-NEXT: movw %di, 12(%rdx)
|
|
; CHECK-NEXT: movw %ax, 10(%rdx)
|
|
; CHECK-NEXT: movw %r11w, 8(%rdx)
|
|
; CHECK-NEXT: movw %cx, 6(%rdx)
|
|
; CHECK-NEXT: movw %r9w, 4(%rdx)
|
|
; CHECK-NEXT: movw %r10w, 2(%rdx)
|
|
; CHECK-NEXT: movw %r8w, (%rdx)
|
|
; CHECK-NEXT: retq
|
|
; Load both four-element source vectors.
%tmp4 = load <4 x half>, <4 x half>* %a
|
|
%tmp5 = load <4 x half>, <4 x half>* %b
|
|
; Mask indices 0-3 select from %tmp4 and 4-7 select from %tmp5, so each
; shuffle below yields the pair <a[i], b[i]> for i = 0, 1, 2, 3.
%tmp7 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 0, i32 4>
|
|
%tmp8 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 1, i32 5>
|
|
%tmp9 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 2, i32 6>
|
|
%tmp10 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 3, i32 7>
|
|
; Build the 8-element result lane by lane, starting from undef: pair i
; supplies result lanes 2*i (its element 0) and 2*i+1 (its element 1).
; Lanes 0 and 1 come from %tmp7.
%tmp11 = extractelement <2 x half> %tmp7, i32 0
|
|
%tmp12 = insertelement <8 x half> undef, half %tmp11, i32 0
|
|
%tmp13 = extractelement <2 x half> %tmp7, i32 1
|
|
%tmp14 = insertelement <8 x half> %tmp12, half %tmp13, i32 1
|
|
; Lanes 2 and 3 come from %tmp8.
%tmp15 = extractelement <2 x half> %tmp8, i32 0
|
|
%tmp16 = insertelement <8 x half> %tmp14, half %tmp15, i32 2
|
|
%tmp17 = extractelement <2 x half> %tmp8, i32 1
|
|
%tmp18 = insertelement <8 x half> %tmp16, half %tmp17, i32 3
|
|
; Lanes 4 and 5 come from %tmp9.
%tmp19 = extractelement <2 x half> %tmp9, i32 0
|
|
%tmp20 = insertelement <8 x half> %tmp18, half %tmp19, i32 4
|
|
%tmp21 = extractelement <2 x half> %tmp9, i32 1
|
|
%tmp22 = insertelement <8 x half> %tmp20, half %tmp21, i32 5
|
|
; Lanes 6 and 7 come from %tmp10.
%tmp23 = extractelement <2 x half> %tmp10, i32 0
|
|
%tmp24 = insertelement <8 x half> %tmp22, half %tmp23, i32 6
|
|
%tmp25 = extractelement <2 x half> %tmp10, i32 1
|
|
%tmp26 = insertelement <8 x half> %tmp24, half %tmp25, i32 7
|
|
; Write the fully populated interleaved vector to the output pointer.
store <8 x half> %tmp26, <8 x half>* %c
|
|
ret void
|
|
}
|