diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 4a1e5edeaa6..305702c8282 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1461,18 +1461,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
           Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pmultishift_qb_128:
-          GCCBuiltin<"__builtin_ia32_vpmultishiftqb128_mask">,
-          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-          llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pmultishift_qb_256:
-          GCCBuiltin<"__builtin_ia32_vpmultishiftqb256_mask">,
-          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-          llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_pmultishift_qb_512:
-          GCCBuiltin<"__builtin_ia32_vpmultishiftqb512_mask">,
-          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
-          llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmultishift_qb_128:
+          GCCBuiltin<"__builtin_ia32_vpmultishiftqb128">,
+          Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmultishift_qb_256:
+          GCCBuiltin<"__builtin_ia32_vpmultishiftqb256">,
+          Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmultishift_qb_512:
+          GCCBuiltin<"__builtin_ia32_vpmultishiftqb512">,
+          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
 }
 
 // Pack ops.
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index ff003a7addd..a00cab883bc 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -299,6 +299,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.min.p") || // Added in 7.0. 128/256 in 5.0
       Name.startswith("avx512.mask.fpclass.p") || // Added in 7.0
       Name.startswith("avx512.mask.vpshufbitqmb.") || // Added in 8.0
+      Name.startswith("avx512.mask.pmultishift.qb.") || // Added in 8.0
       Name == "sse.cvtsi2ss" || // Added in 7.0
       Name == "sse.cvtsi642ss" || // Added in 7.0
       Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -1448,6 +1449,15 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
       IID = Intrinsic::x86_avx512_dbpsadbw_512;
     else
       llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("pmultishift.qb.")) {
+    if (VecWidth == 128)
+      IID = Intrinsic::x86_avx512_pmultishift_qb_128;
+    else if (VecWidth == 256)
+      IID = Intrinsic::x86_avx512_pmultishift_qb_256;
+    else if (VecWidth == 512)
+      IID = Intrinsic::x86_avx512_pmultishift_qb_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
   } else
     return false;
 
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 3a3561badc0..890aa76858d 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -775,12 +775,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::VTRUNCUS, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
                      X86ISD::VTRUNCUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
-                     X86ISD::MULTISHIFT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
-                     X86ISD::MULTISHIFT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
-                     X86ISD::MULTISHIFT, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
   X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
@@ -888,6 +882,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_pmul_hr_sw_512, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
   X86_INTRINSIC_DATA(avx512_pmulh_w_512, INTR_TYPE_2OP, ISD::MULHS, 0),
   X86_INTRINSIC_DATA(avx512_pmulhu_w_512, INTR_TYPE_2OP, ISD::MULHU, 0),
+  X86_INTRINSIC_DATA(avx512_pmultishift_qb_128, INTR_TYPE_2OP, X86ISD::MULTISHIFT, 0),
+  X86_INTRINSIC_DATA(avx512_pmultishift_qb_256, INTR_TYPE_2OP, X86ISD::MULTISHIFT, 0),
+  X86_INTRINSIC_DATA(avx512_pmultishift_qb_512, INTR_TYPE_2OP, X86ISD::MULTISHIFT, 0),
   X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
   X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
   X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-fast-isel.ll
index 495f612ff42..8eb9a7c3866 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-fast-isel.ll
@@ -96,3 +96,67 @@ entry:
 }
 
 declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>)
+
+define <8 x i64> @test_mm512_mask_multishift_epi64_epi8(<8 x i64> %__W, i64 %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X86-LABEL: test_mm512_mask_multishift_epi64_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    kunpckdq %k1, %k0, %k1
+; X86-NEXT:    vpmultishiftqb %zmm2, %zmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_multishift_epi64_epi8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovq %rdi, %k1
+; X64-NEXT:    vpmultishiftqb %zmm2, %zmm1, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__X to <64 x i8>
+  %1 = bitcast <8 x i64> %__Y to <64 x i8>
+  %2 = tail call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %0, <64 x i8> %1)
+  %3 = bitcast <8 x i64> %__W to <64 x i8>
+  %4 = bitcast i64 %__M to <64 x i1>
+  %5 = select <64 x i1> %4, <64 x i8> %2, <64 x i8> %3
+  %6 = bitcast <64 x i8> %5 to <8 x i64>
+  ret <8 x i64> %6
+}
+
+define <8 x i64> @test_mm512_maskz_multishift_epi64_epi8(i64 %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X86-LABEL: test_mm512_maskz_multishift_epi64_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    kunpckdq %k1, %k0, %k1
+; X86-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_multishift_epi64_epi8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovq %rdi, %k1
+; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = bitcast <8 x i64> %__X to <64 x i8>
+  %1 = bitcast <8 x i64> %__Y to <64 x i8>
+  %2 = tail call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %0, <64 x i8> %1)
+  %3 = bitcast i64 %__M to <64 x i1>
+  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
+  %5 = bitcast <64 x i8> %4 to <8 x i64>
+  ret <8 x i64> %5
+}
+
+define <8 x i64> @test_mm512_multishift_epi64_epi8(<8 x i64> %__X, <8 x i64> %__Y) {
+; CHECK-LABEL: test_mm512_multishift_epi64_epi8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = bitcast <8 x i64> %__X to <64 x i8>
+  %1 = bitcast <8 x i64> %__Y to <64 x i8>
+  %2 = tail call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %0, <64 x i8> %1)
+  %3 = bitcast <64 x i8> %2 to <8 x i64>
+  ret <8 x i64> %3
+}
+
+declare <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8>, <64 x i8>)
diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll
index a09cf095aef..8c6b982fe74 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll
@@ -32,6 +32,36 @@ define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8
   ret <64 x i8> %res4
 }
 
+declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1]
+; X86-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1]
+; X86-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
+; X86-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1]
+; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1]
+; X64-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
+; X64-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3)
+  %res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+  %res3 = add <64 x i8> %res, %res1
+  %res4 = add <64 x i8> %res3, %res2
+  ret <64 x i8> %res4
+}
+
 declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll
index ffce664b8d6..23a7e2ac3e9 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll
@@ -36,15 +36,13 @@ define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8
   ret <64 x i8> %res4
 }
 
-declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+declare <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8>, <64 x i8>)
 
 define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9]
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x04]
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1]
 ; X86-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1]
 ; X86-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
@@ -53,18 +51,22 @@ define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9]
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1]
-; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xd9]
-; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xc1]
-; X64-NEXT:    vpaddb %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfc,0xc0]
+; X64-NEXT:    vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1]
+; X64-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
-  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3)
-  %res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
-  %res3 = add <64 x i8> %res, %res1
-  %res4 = add <64 x i8> %res3, %res2
+  %1 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1)
+  %2 = bitcast i64 %x3 to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2
+  %4 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1)
+  %5 = bitcast i64 %x3 to <64 x i1>
+  %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
+  %7 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1)
+  %res3 = add <64 x i8> %3, %6
+  %res4 = add <64 x i8> %res3, %7
   ret <64 x i8> %res4
 }
diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-fast-isel.ll
index 70848be4a6d..896a30af5f4 100644
--- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-fast-isel.ll
@@ -174,5 +174,123 @@ entry:
   ret <4 x i64> %6
 }
 
+define <2 x i64> @test_mm_mask_multishift_epi64_epi8(<2 x i64> %__W, i16 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X86-LABEL: test_mm_mask_multishift_epi64_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmultishiftqb %xmm2, %xmm1, %xmm0 {%k1}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm_mask_multishift_epi64_epi8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovd %edi, %k1
+; X64-NEXT:    vpmultishiftqb %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__X to <16 x i8>
+  %1 = bitcast <2 x i64> %__Y to <16 x i8>
+  %2 = tail call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %0, <16 x i8> %1)
+  %3 = bitcast <2 x i64> %__W to <16 x i8>
+  %4 = bitcast i16 %__M to <16 x i1>
+  %5 = select <16 x i1> %4, <16 x i8> %2, <16 x i8> %3
+  %6 = bitcast <16 x i8> %5 to <2 x i64>
+  ret <2 x i64> %6
+}
+
+define <2 x i64> @test_mm_maskz_multishift_epi64_epi8(i16 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X86-LABEL: test_mm_maskz_multishift_epi64_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskz_multishift_epi64_epi8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovd %edi, %k1
+; X64-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = bitcast <2 x i64> %__X to <16 x i8>
+  %1 = bitcast <2 x i64> %__Y to <16 x i8>
+  %2 = tail call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %0, <16 x i8> %1)
+  %3 = bitcast i16 %__M to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
+  %5 = bitcast <16 x i8> %4 to <2 x i64>
+  ret <2 x i64> %5
+}
+
+define <2 x i64> @test_mm_multishift_epi64_epi8(<2 x i64> %__X, <2 x i64> %__Y) {
+; CHECK-LABEL: test_mm_multishift_epi64_epi8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = bitcast <2 x i64> %__X to <16 x i8>
+  %1 = bitcast <2 x i64> %__Y to <16 x i8>
+  %2 = tail call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %0, <16 x i8> %1)
+  %3 = bitcast <16 x i8> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
+define <4 x i64> @test_mm256_mask_multishift_epi64_epi8(<4 x i64> %__W, i32 %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X86-LABEL: test_mm256_mask_multishift_epi64_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmultishiftqb %ymm2, %ymm1, %ymm0 {%k1}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm256_mask_multishift_epi64_epi8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovd %edi, %k1
+; X64-NEXT:    vpmultishiftqb %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__X to <32 x i8>
+  %1 = bitcast <4 x i64> %__Y to <32 x i8>
+  %2 = tail call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %0, <32 x i8> %1)
+  %3 = bitcast <4 x i64> %__W to <32 x i8>
+  %4 = bitcast i32 %__M to <32 x i1>
+  %5 = select <32 x i1> %4, <32 x i8> %2, <32 x i8> %3
+  %6 = bitcast <32 x i8> %5 to <4 x i64>
+  ret <4 x i64> %6
+}
+
+define <4 x i64> @test_mm256_maskz_multishift_epi64_epi8(i32 %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X86-LABEL: test_mm256_maskz_multishift_epi64_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm256_maskz_multishift_epi64_epi8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovd %edi, %k1
+; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__X to <32 x i8>
+  %1 = bitcast <4 x i64> %__Y to <32 x i8>
+  %2 = tail call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %0, <32 x i8> %1) #3
+  %3 = bitcast i32 %__M to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
+  %5 = bitcast <32 x i8> %4 to <4 x i64>
+  ret <4 x i64> %5
+}
+
+define <4 x i64> @test_mm256_multishift_epi64_epi8(<4 x i64> %__X, <4 x i64> %__Y) {
+; CHECK-LABEL: test_mm256_multishift_epi64_epi8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = bitcast <4 x i64> %__X to <32 x i8>
+  %1 = bitcast <4 x i64> %__Y to <32 x i8>
+  %2 = tail call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %0, <32 x i8> %1)
+  %3 = bitcast <32 x i8> %2 to <4 x i64>
+  ret <4 x i64> %3
+}
+
 declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>)
 declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>)
+declare <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8>, <32 x i8>)
diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll
index 7572e40f0b2..4b1d51da46f 100644
--- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll
@@ -62,6 +62,66 @@ define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8
   ret <32 x i8> %res4
 }
 
+declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
+; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1]
+; X86-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
+; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
+; X64-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1]
+; X64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
+; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+  %res3 = add <16 x i8> %res, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1]
+; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1]
+; X86-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
+; X86-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1]
+; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1]
+; X64-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
+; X64-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3)
+  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+  %res3 = add <32 x i8> %res, %res1
+  %res4 = add <32 x i8> %res3, %res2
+  ret <32 x i8> %res4
+}
+
 declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
 
 define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
index 79f32103ddd..7c03d78c825 100644
--- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -70,63 +70,71 @@ define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8
   ret <32 x i8> %res4
 }
 
-declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+declare <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8>, <16 x i8>)
 
 define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
-; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xd9]
-; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1]
-; X86-NEXT:    vpaddb %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfc,0xc0]
+; X86-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1]
+; X86-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
 ; X64-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1]
 ; X64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
-  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3)
-  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
-  %res3 = add <16 x i8> %res, %res1
-  %res4 = add <16 x i8> %res3, %res2
+  %1 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2
+  %4 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1)
+  %5 = bitcast i16 %x3 to <16 x i1>
+  %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer
+  %7 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1)
+  %res3 = add <16 x i8> %3, %6
+  %res4 = add <16 x i8> %res3, %7
   ret <16 x i8> %res4
 }
 
-declare <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+declare <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8>, <32 x i8>)
 
 define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256:
 ; X86:       # %bb.0:
+; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1]
-; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xd9]
-; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1]
-; X86-NEXT:    vpaddb %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfc,0xc0]
+; X86-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1]
+; X86-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256:
 ; X64:       # %bb.0:
+; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1]
-; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xd9]
-; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1]
-; X64-NEXT:    vpaddb %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfc,0xc0]
+; X64-NEXT:    vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1]
+; X64-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
-  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3)
-  %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
-  %res3 = add <32 x i8> %res, %res1
-  %res4 = add <32 x i8> %res3, %res2
+  %1 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2
+  %4 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1)
+  %5 = bitcast i32 %x3 to <32 x i1>
+  %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer
+  %7 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1)
+  %res3 = add <32 x i8> %3, %6
+  %res4 = add <32 x i8> %res3, %7
   ret <32 x i8> %res4
 }
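
Illustrative note (not part of the patch): with the masked intrinsics gone, masking is expressed in IR as a vector select on the result of the unmasked intrinsic, which is the pattern upgradeAVX512MaskToSelect produces and the updated tests above check. A minimal sketch for the 512-bit case, using hypothetical value names:

  ; old bitcode, auto-upgraded on load:
  ;   %r = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passthru, i64 %mask)
  ; replacement IR emitted by the upgrade:
  %u = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %a, <64 x i8> %b)
  %m = bitcast i64 %mask to <64 x i1>
  %r = select <64 x i1> %m, <64 x i8> %u, <64 x i8> %passthru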