mirror of
https://github.com/Gericom/teak-llvm.git
synced 2025-06-19 11:35:51 -04:00

This adjusts the way that we lower NEON shifts so that they use a DAG target node rather than a NEON intrinsic. This is useful for handling MVE shift operations in the same way. It also renames some of the immediate shift nodes for consistency, and moves some of the processing of immediate shifts into LowerShift, allowing it to capture more cases. Differential Revision: https://reviews.llvm.org/D64426 llvm-svn: 366051
544 lines
20 KiB
LLVM
544 lines
20 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - -lower-interleaved-accesses=false | FileCheck %s
|
|
|
|
; Loads two <8 x i8> vectors from %A and %B and feeds them to the NEON
; pairwise-add intrinsic llvm.arm.neon.vpadd.v8i8. The expected code is two
; d-register loads followed by a single vpadd.i8.
define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vpaddi8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vpadd.i8 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}
|
|
|
|
; Same as the i8 case, but for <4 x i16>: both operands are loaded and passed
; to llvm.arm.neon.vpadd.v4i16, and a single vpadd.i16 is expected.
define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vpaddi16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vpadd.i16 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}
|
|
|
|
; <2 x i32> variant: the two loaded operands go through
; llvm.arm.neon.vpadd.v2i32 and must select to a single vpadd.i32.
define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vpaddi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vpadd.i32 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}
|
|
|
|
; Floating-point variant: two <2 x float> operands are loaded and passed to
; llvm.arm.neon.vpadd.v2f32; the expected instruction is vpadd.f32.
define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vpaddf32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r1]
; CHECK-NEXT: vldr d17, [r0]
; CHECK-NEXT: vpadd.f32 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = load <2 x float>, <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}
|
|
|
|
; Declarations of the NEON vpadd intrinsics called by the vpaddi8, vpaddi16,
; vpaddi32 and vpaddf32 tests.
declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
|
|
|
|
; Loads an <8 x i8> vector and widens it to <4 x i16> through the vpaddls
; intrinsic; the expected instruction is the signed d-register form vpaddl.s8.
define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddls8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vpaddl.s8 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp2
}
|
|
|
|
; <4 x i16> -> <2 x i32> through the vpaddls intrinsic; expects vpaddl.s16 on
; a d register.
define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddls16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vpaddl.s16 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp2
}
|
|
|
|
; <2 x i32> -> <1 x i64> through the vpaddls intrinsic; expects vpaddl.s32 on
; a d register.
define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddls32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vpaddl.s32 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp2
}
|
|
|
|
; Unsigned counterpart of vpaddls8: <8 x i8> -> <4 x i16> through the vpaddlu
; intrinsic; expects vpaddl.u8 on a d register.
define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlu8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vpaddl.u8 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
  ret <4 x i16> %tmp2
}
|
|
|
|
; <4 x i16> -> <2 x i32> through the vpaddlu intrinsic; expects vpaddl.u16 on
; a d register.
define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlu16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vpaddl.u16 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
  ret <2 x i32> %tmp2
}
|
|
|
|
; <2 x i32> -> <1 x i64> through the vpaddlu intrinsic; expects vpaddl.u32 on
; a d register.
define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlu32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vpaddl.u32 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
  ret <1 x i64> %tmp2
}
|
|
|
|
; 128-bit (q-register) form: <16 x i8> -> <8 x i16> through the vpaddls
; intrinsic; expects vpaddl.s8 q8, q8 and the result returned in r0-r3.
define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlQs8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.s8 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp2
}
|
|
|
|
; q-register form: <8 x i16> -> <4 x i32> through the vpaddls intrinsic;
; expects vpaddl.s16 q8, q8.
define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlQs16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.s16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp2
}
|
|
|
|
; q-register form: <4 x i32> -> <2 x i64> through the vpaddls intrinsic;
; expects vpaddl.s32 q8, q8.
define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlQs32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.s32 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp2
}
|
|
|
|
; Unsigned q-register form: <16 x i8> -> <8 x i16> through the vpaddlu
; intrinsic; expects vpaddl.u8 q8, q8.
define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlQu8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.u8 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
  ret <8 x i16> %tmp2
}
|
|
|
|
; Unsigned q-register form: <8 x i16> -> <4 x i32> through the vpaddlu
; intrinsic; expects vpaddl.u16 q8, q8.
define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlQu16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.u16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
  ret <4 x i32> %tmp2
}
|
|
|
|
; Unsigned q-register form: <4 x i32> -> <2 x i64> through the vpaddlu
; intrinsic; expects vpaddl.u32 q8, q8.
define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlQu32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.u32 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
  ret <2 x i64> %tmp2
}
|
|
|
|
; Combine vuzp+vadd->vpadd.
; The even and odd lanes of one <16 x i8> load are split out with two
; shufflevectors and added. The expected code is a single vpadd.i8 on the two
; d-register halves of the load, with no explicit unzip.
define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpadd.i8 d16, d16, d17
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>

  %add = add <8 x i8> %tmp3, %tmp1
  store <8 x i8> %add, <8 x i8>* %X, align 8
  ret void
}
|
|
|
|
; Combine vuzp+vadd->vpadd.
; Even/odd lanes of one <8 x i16> load are extracted with shufflevector and
; added; the expected code is a single vpadd.i16 on the two halves of the load.
define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpadd.i16 d16, d16, d17
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i16> %tmp3, %tmp1
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}
|
|
|
|
; Combine vtrn+vadd->vpadd.
; Lanes {0,2} and {1,3} of one <4 x i32> load are extracted with shufflevector
; and added; the expected code is a single vpadd.i32 on the two halves of the
; load.
define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpadd.i32 d16, d16, d17
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %add = add <2 x i32> %tmp3, %tmp1
  store <2 x i32> %add, <2 x i32>* %X, align 8
  ret void
}
|
|
|
|
; Combine vuzp+vaddl->vpaddl
; Even/odd <8 x i8> lanes of one <16 x i8> load are sign-extended to
; <8 x i16> and added; the expected code is a single vpaddl.s8 on the q
; register.
define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.s8 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
  %add = add <8 x i16> %tmp4, %tmp5
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}
|
|
|
|
; Combine vuzp+vaddl->vpaddl
; FIXME: Legalization butchers the shuffles.
; Only lanes 0..7 of the <16 x i8> load are used, as two <4 x i8> vectors that
; are sign-extended and added. The expected code is currently a
; vext/vshl/vshr/vsra sequence, not a vpaddl.
define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_s8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vext.8 d18, d16, d16, #1
; CHECK-NEXT: vshl.i16 d16, d16, #8
; CHECK-NEXT: vshl.i16 d18, d18, #8
; CHECK-NEXT: vshr.s16 d17, d18, #8
; CHECK-NEXT: vsra.s16 d17, d16, #8
; CHECK-NEXT: vstr d17, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = sext <4 x i8> %tmp3 to <4 x i16>
  %tmp5 = sext <4 x i8> %tmp1 to <4 x i16>
  %add = add <4 x i16> %tmp4, %tmp5
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}
|
|
|
|
; Combine vuzp+vaddl->vpaddl
; Even/odd <8 x i8> lanes of one <16 x i8> load are zero-extended to
; <8 x i16> and added; the expected code is a single vpaddl.u8 on the q
; register.
define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.u8 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp1 to <8 x i16>
  %add = add <8 x i16> %tmp4, %tmp5
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}
|
|
|
|
; In theory, it's possible to match this to vpaddl, but rearranging the
; shuffle is awkward, so this doesn't match at the moment.
; Here the zext is applied to the whole <16 x i8> value before the even/odd
; shuffles; the expected code is vmovl.u8 + vuzp.16 + vadd.i16.
define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vmovl.u8 q9, d17
; CHECK-NEXT: vmovl.u8 q8, d16
; CHECK-NEXT: vuzp.16 q8, q9
; CHECK-NEXT: vadd.i16 q8, q8, q9
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %tmp2, %tmp3
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}
|
|
|
|
; Combine vuzp+vaddl->vpaddl
; FIXME: Legalization butchers the shuffle.
; Only lanes 0..7 of the <16 x i8> load are used, as two <4 x i8> vectors that
; are zero-extended and added. The expected code is currently
; vext + vbic masking + vadd.i16, not a vpaddl.
define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vext.8 d18, d16, d16, #1
; CHECK-NEXT: vbic.i16 d16, #0xff00
; CHECK-NEXT: vbic.i16 d18, #0xff00
; CHECK-NEXT: vadd.i16 d16, d18, d16
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
  %tmp5 = zext <4 x i8> %tmp1 to <4 x i16>
  %add = add <4 x i16> %tmp4, %tmp5
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}
|
|
|
|
; Matching to vpaddl.8 requires matching shuffle(zext()).
; The zext is applied to the whole <16 x i8> value before the even/odd
; <4 x i16> shuffles; the expected code currently widens with vmovl.u8 and
; then uses vpadd.i16.
define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vmovl.u8 q8, d16
; CHECK-NEXT: vpadd.i16 d16, d16, d17
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i16> %tmp2, %tmp3
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}
|
|
|
|
; Combine vuzp+vaddl->vpaddl
; Even/odd <4 x i16> lanes of one <8 x i16> load are sign-extended to
; <4 x i32> and added; the expected code is a single vpaddl.s16 on the q
; register.
define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.s16 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp1 to <4 x i32>
  %add = add <4 x i32> %tmp4, %tmp5
  store <4 x i32> %add, <4 x i32>* %X, align 8
  ret void
}
|
|
|
|
; Combine vuzp+vaddl->vpaddl
; Even/odd <4 x i16> lanes of one <8 x i16> load are zero-extended to
; <4 x i32> and added; the expected code is a single vpaddl.u16 on the q
; register.
define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.u16 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp1 to <4 x i32>
  %add = add <4 x i32> %tmp4, %tmp5
  store <4 x i32> %add, <4 x i32>* %X, align 8
  ret void
}
|
|
|
|
; Combine vtrn+vaddl->vpaddl
; Lanes {0,2} and {1,3} of one <4 x i32> load are sign-extended to <2 x i64>
; and added; the expected code is a single vpaddl.s32 on the q register.
define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.s32 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp4 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp1 to <2 x i64>
  %add = add <2 x i64> %tmp4, %tmp5
  store <2 x i64> %add, <2 x i64>* %X, align 8
  ret void
}
|
|
|
|
; Combine vtrn+vaddl->vpaddl
; Lanes {0,2} and {1,3} of one <4 x i32> load are zero-extended to <2 x i64>
; and added; the expected code is a single vpaddl.u32 on the q register.
define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vpaddl.u32 q8, q8
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp1 to <2 x i64>
  %add = add <2 x i64> %tmp4, %tmp5
  store <2 x i64> %add, <2 x i64>* %X, align 8
  ret void
}
|
|
|
|
; Legalization promotes the <4 x i8> to <4 x i16>.
; The even and odd lanes of an <8 x i8> argument are added as <4 x i8>; the
; expected code performs the add with a single vpaddl.s8 on the d register.
define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_i8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vpaddl.s8 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %x = add <4 x i8> %tmp2, %tmp1
  ret <4 x i8> %x
}
|
|
|
|
; Legalization promotes the <2 x i16> to <2 x i32>.
; Lanes {0,2} and {1,3} of a <4 x i16> argument are added as <2 x i16>; the
; expected code performs the add with a single vpaddl.s16 on the d register.
define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vpaddl.s16 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i16> %tmp2, %tmp1
  ret <2 x i16> %x
}
|
|
|
|
; And <2 x i8> to <2 x i32>
; Only lanes 0..3 of the <8 x i8> argument are used, as two <2 x i8> vectors.
; No vpaddl is expected here: the lanes are moved through core registers with
; vmov.u8 / vmov.32 and added with vadd.i32.
define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_2i8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vmov.u8 r1, d16[1]
; CHECK-NEXT: vmov.u8 r0, d16[0]
; CHECK-NEXT: vmov.u8 r2, d16[2]
; CHECK-NEXT: vmov.u8 r3, d16[3]
; CHECK-NEXT: vmov.32 d17[0], r1
; CHECK-NEXT: vmov.32 d16[0], r0
; CHECK-NEXT: vmov.32 d17[1], r3
; CHECK-NEXT: vmov.32 d16[1], r2
; CHECK-NEXT: vadd.i32 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i8> %tmp2, %tmp1
  ret <2 x i8> %x
}
|
|
|
|
; Only lanes 0..3 of the <8 x i16> argument are used, as two <2 x i16>
; vectors. No vpaddl is expected here: the lanes are moved through core
; registers with vmov.u16 / vmov.32 and added with vadd.i32.
define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_2i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vmov.u16 r0, d16[0]
; CHECK-NEXT: vmov.u16 r1, d16[1]
; CHECK-NEXT: vmov.u16 r3, d16[3]
; CHECK-NEXT: vmov.u16 r2, d16[2]
; CHECK-NEXT: vmov.32 d16[0], r0
; CHECK-NEXT: vmov.32 d17[0], r1
; CHECK-NEXT: vmov.32 d16[1], r2
; CHECK-NEXT: vmov.32 d17[1], r3
; CHECK-NEXT: vadd.i32 d16, d17, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i16> %tmp2, %tmp1
  ret <2 x i16> %x
}
|
|
|
|
|
|
; Declarations of the NEON pairwise add-long intrinsics called by the vpaddl*
; tests in this file: vpaddls selects to vpaddl.sN and vpaddlu to vpaddl.uN.
; The first two groups are the 64-bit (d-register) forms, the last two the
; 128-bit (q-register) forms.
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
|