Mercurial > hg > CbC > CbC_gcc
diff gcc/config/i386/sse.md @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | 84e7813d76e9 |
children |
line wrap: on
line diff
--- a/gcc/config/i386/sse.md Thu Oct 25 07:37:49 2018 +0900 +++ b/gcc/config/i386/sse.md Thu Feb 13 11:34:05 2020 +0900 @@ -1,5 +1,5 @@ ;; GCC machine description for SSE instructions -;; Copyright (C) 2005-2018 Free Software Foundation, Inc. +;; Copyright (C) 2005-2020 Free Software Foundation, Inc. ;; ;; This file is part of GCC. ;; @@ -21,6 +21,9 @@ ;; SSE UNSPEC_MOVNT + ;; SSE2 + UNSPEC_MOVDI_TO_SSE + ;; SSE3 UNSPEC_LDDQU @@ -184,6 +187,14 @@ ;; For AVX512BITALG support UNSPEC_VPSHUFBIT + + ;; For VP2INTERSECT support + UNSPEC_VP2INTERSECT + + ;; For AVX512BF16 support + UNSPEC_VCVTNE2PS2BF16 + UNSPEC_VCVTNEPS2BF16 + UNSPEC_VDPBF16PS ]) (define_c_enum "unspecv" [ @@ -271,6 +282,9 @@ (define_mode_iterator VF1 [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF]) +(define_mode_iterator VF1_AVX2 + [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX2") V4SF]) + ;; 128- and 256-bit SF vector modes (define_mode_iterator VF1_128_256 [(V8SF "TARGET_AVX") V4SF]) @@ -584,6 +598,10 @@ (define_mode_attr ssequarterinsnmode [(V16SF "V4SF") (V8DF "V2DF") (V16SI "TI") (V8DI "TI")]) +(define_mode_attr vecmemsuffix + [(V16SF "{z}") (V8SF "{y}") (V4SF "{x}") + (V8DF "{z}") (V4DF "{y}") (V2DF "{x}")]) + (define_mode_attr ssedoublemodelower [(V16QI "v16hi") (V32QI "v32hi") (V64QI "v64hi") (V8HI "v8si") (V16HI "v16si") (V32HI "v32si") @@ -593,12 +611,13 @@ [(V4SF "V8SF") (V8SF "V16SF") (V16SF "V32SF") (V2DF "V4DF") (V4DF "V8DF") (V8DF "V16DF") (V16QI "V16HI") (V32QI "V32HI") (V64QI "V64HI") - (V4HI "V4SI") (V8HI "V8SI") (V16HI "V16SI") (V32HI "V32SI") + (V8HI "V8SI") (V16HI "V16SI") (V32HI "V32SI") (V4SI "V4DI") (V8SI "V16SI") (V16SI "V32SI") (V4DI "V8DI") (V8DI "V16DI")]) (define_mode_attr ssebytemode - [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI")]) + [(V8DI "V64QI") (V4DI "V32QI") (V2DI "V16QI") + (V16SI "V64QI") (V8SI "V32QI") (V4SI "V16QI")]) ;; All 128bit vector integer modes (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI]) @@ -722,6 +741,15 @@ (V16SF "hi") (V8SF "qi") (V4SF "qi") (V8DF "qi") (V4DF "qi") (V2DF "qi")]) +;; Mapping of vector modes to corresponding mask half size +(define_mode_attr avx512fmaskhalfmode + [(V64QI "SI") (V32QI "HI") (V16QI "QI") + (V32HI "HI") (V16HI "QI") (V8HI "QI") (V4HI "QI") + (V16SI "QI") (V8SI "QI") (V4SI "QI") + (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V16SF "QI") (V8SF "QI") (V4SF "QI") + (V8DF "QI") (V4DF "QI") (V2DF "QI")]) + ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode [(V16SF "V16SI") (V8DF "V8DI") @@ -825,6 +853,13 @@ (V4SF "k") (V2DF "q") (SF "k") (DF "q")]) +;; Mapping of vector modes to VPTERNLOG suffix +(define_mode_attr ternlogsuffix + [(V8DI "q") (V4DI "q") (V2DI "q") + (V16SI "d") (V8SI "d") (V4SI "d") + (V32HI "d") (V16HI "d") (V8HI "d") + (V64QI "d") (V32QI "d") (V16QI "d")]) + ;; Number of scalar elements in each vector type (define_mode_attr ssescalarnum [(V64QI "64") (V16SI "16") (V8DI "8") @@ -1081,16 +1116,17 @@ (cond [(and (eq_attr "alternative" "1") (match_test "TARGET_AVX512VL")) (const_string "<sseinsnmode>") - (and (match_test "<MODE_SIZE> == 16") - (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") - (and (eq_attr "alternative" "3") - (match_test "TARGET_SSE_TYPELESS_STORES")))) - (const_string "<ssePSmode>") (match_test "TARGET_AVX") (const_string "<sseinsnmode>") (ior (not (match_test "TARGET_SSE2")) (match_test "optimize_function_for_size_p (cfun)")) (const_string "V4SF") + (and (match_test "<MODE>mode == V2DFmode") + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) + (const_string "V4SF") + (and (eq_attr "alternative" "3") + (match_test "TARGET_SSE_TYPELESS_STORES")) + (const_string "V4SF") (and (eq_attr "alternative" "0") (match_test "TARGET_SSE_LOAD0_BY_PXOR")) (const_string "TI") @@ -1147,6 +1183,67 @@ (set_attr "memory" "none,load") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "avx512f_mov<ssescalarmodelower>_mask" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (match_operand:VF_128 2 "register_operand" "v") + (match_operand:VF_128 3 "nonimm_or_0_operand" "0C") + (match_operand:QI 4 "register_operand" "Yk")) + (match_operand:VF_128 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512F" + "vmov<ssescalarmodesuffix>\t{%2, %1, %0%{%4%}%N3|%0%{%4%}%N3, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<ssescalarmode>")]) + +(define_expand "avx512f_load<mode>_mask" + [(set (match_operand:<ssevecmode> 0 "register_operand") + (vec_merge:<ssevecmode> + (vec_merge:<ssevecmode> + (vec_duplicate:<ssevecmode> + (match_operand:MODEF 1 "memory_operand")) + (match_operand:<ssevecmode> 2 "nonimm_or_0_operand") + (match_operand:QI 3 "register_operand")) + (match_dup 4) + (const_int 1)))] + "TARGET_AVX512F" + "operands[4] = CONST0_RTX (<ssevecmode>mode);") + +(define_insn "*avx512f_load<mode>_mask" + [(set (match_operand:<ssevecmode> 0 "register_operand" "=v") + (vec_merge:<ssevecmode> + (vec_merge:<ssevecmode> + (vec_duplicate:<ssevecmode> + (match_operand:MODEF 1 "memory_operand" "m")) + (match_operand:<ssevecmode> 2 "nonimm_or_0_operand" "0C") + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:<ssevecmode> 4 "const0_operand" "C") + (const_int 1)))] + "TARGET_AVX512F" + "vmov<ssescalarmodesuffix>\t{%1, %0%{%3%}%N2|%0%{3%}%N2, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "memory" "load") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512f_store<mode>_mask" + [(set (match_operand:MODEF 0 "memory_operand" "=m") + (if_then_else:MODEF + (and:QI (match_operand:QI 2 "register_operand" "Yk") + (const_int 1)) + (vec_select:MODEF + (match_operand:<ssevecmode> 1 "register_operand" "v") + (parallel [(const_int 0)])) + (match_dup 0)))] + "TARGET_AVX512F" + "vmov<ssescalarmodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "memory" "store") + (set_attr "mode" "<MODE>")]) + (define_insn "<avx512>_blendm<mode>" [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v") (vec_merge:V48_AVX512VL @@ -1235,10 +1332,10 @@ ;; from there. (define_insn_and_split "movdi_to_sse" - [(parallel - [(set (match_operand:V4SI 0 "register_operand" "=?x,x") - (subreg:V4SI (match_operand:DI 1 "nonimmediate_operand" "r,m") 0)) - (clobber (match_scratch:V4SI 2 "=&x,X"))])] + [(set (match_operand:V4SI 0 "register_operand" "=x,x,?x") + (unspec:V4SI [(match_operand:DI 1 "nonimmediate_operand" "r,m,r")] + UNSPEC_MOVDI_TO_SSE)) + (clobber (match_scratch:V4SI 2 "=X,X,&x"))] "!TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC" "#" "&& reload_completed" @@ -1250,18 +1347,26 @@ Assemble the 64-bit DImode value in an xmm register. */ emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode), gen_lowpart (SImode, operands[1]))); - emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), - gen_highpart (SImode, operands[1]))); - emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0], - operands[2])); - } + if (TARGET_SSE4_1) + emit_insn (gen_sse4_1_pinsrd (operands[0], operands[0], + gen_highpart (SImode, operands[1]), + GEN_INT (2))); + else + { + emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), + gen_highpart (SImode, operands[1]))); + emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0], + operands[2])); + } + } else if (memory_operand (operands[1], DImode)) emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]), operands[1], const0_rtx)); else gcc_unreachable (); DONE; -}) +} + [(set_attr "isa" "sse4,*,*")]) (define_split [(set (match_operand:V4SF 0 "register_operand") @@ -1626,41 +1731,58 @@ "TARGET_SSE" "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;") -(define_insn_and_split "*absneg<mode>2" +(define_insn_and_split "*<code><mode>2" [(set (match_operand:VF 0 "register_operand" "=x,x,v,v") - (match_operator:VF 3 "absneg_operator" - [(match_operand:VF 1 "vector_operand" "0, xBm,v, m")])) + (absneg:VF + (match_operand:VF 1 "vector_operand" "0, xBm,v, m"))) (use (match_operand:VF 2 "vector_operand" "xBm,0, vm,v"))] "TARGET_SSE" "#" "&& reload_completed" - [(const_int 0)] -{ - enum rtx_code absneg_op; - rtx op1, op2; - rtx t; + [(set (match_dup 0) (match_dup 3))] +{ + enum rtx_code absneg_op = <CODE> == ABS ? AND : XOR; if (TARGET_AVX) { if (MEM_P (operands[1])) - op1 = operands[2], op2 = operands[1]; - else - op1 = operands[1], op2 = operands[2]; - } - else - { - op1 = operands[0]; - if (rtx_equal_p (operands[0], operands[1])) - op2 = operands[2]; - else - op2 = operands[1]; - } - - absneg_op = GET_CODE (operands[3]) == NEG ? XOR : AND; - t = gen_rtx_fmt_ee (absneg_op, <MODE>mode, op1, op2); - t = gen_rtx_SET (operands[0], t); - emit_insn (t); - DONE; + std::swap (operands[1], operands[2]); + } + else + { + if (operands_match_p (operands[0], operands[2])) + std::swap (operands[1], operands[2]); + } + + operands[3] + = gen_rtx_fmt_ee (absneg_op, <MODE>mode, operands[1], operands[2]); +} + [(set_attr "isa" "noavx,noavx,avx,avx")]) + +(define_insn_and_split "*nabs<mode>2" + [(set (match_operand:VF 0 "register_operand" "=x,x,v,v") + (neg:VF + (abs:VF + (match_operand:VF 1 "vector_operand" "0,xBm,v,m")))) + (use (match_operand:VF 2 "vector_operand" "xBm,0,vm,v"))] + "TARGET_SSE" + "#" + "&& reload_completed" + [(set (match_dup 0) (match_dup 3))] +{ + if (TARGET_AVX) + { + if (MEM_P (operands[1])) + std::swap (operands[1], operands[2]); + } + else + { + if (operands_match_p (operands[0], operands[2])) + std::swap (operands[1], operands[2]); + } + + operands[3] + = gen_rtx_fmt_ee (IOR, <MODE>mode, operands[1], operands[2]); } [(set_attr "isa" "noavx,noavx,avx,avx")]) @@ -1715,12 +1837,34 @@ (set_attr "type" "sseadd") (set_attr "mode" "<MODE>")]) +;; Standard scalar operation patterns which preserve the rest of the +;; vector for combiner. +(define_insn "*<sse>_vm<plusminus_insn><mode>3" + [(set (match_operand:VF_128 0 "register_operand" "=x,v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (plusminus:<ssescalarmode> + (vec_select:<ssescalarmode> + (match_operand:VF_128 1 "register_operand" "0,v") + (parallel [(const_int 0)])) + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "xm,vm"))) + (match_dup 1) + (const_int 1)))] + "TARGET_SSE" + "@ + <plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %2} + v<plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseadd") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "<ssescalarmode>")]) + (define_insn "<sse>_vm<plusminus_insn><mode>3<mask_scalar_name><round_scalar_name>" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (plusminus:VF_128 (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "vector_operand" "xBm,<round_scalar_constraint>")) + (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_scalar_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_SSE" @@ -1769,12 +1913,35 @@ (set_attr "type" "ssemul") (set_attr "mode" "<MODE>")]) +;; Standard scalar operation patterns which preserve the rest of the +;; vector for combiner. +(define_insn "*<sse>_vm<multdiv_mnemonic><mode>3" + [(set (match_operand:VF_128 0 "register_operand" "=x,v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (multdiv:<ssescalarmode> + (vec_select:<ssescalarmode> + (match_operand:VF_128 1 "register_operand" "0,v") + (parallel [(const_int 0)])) + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "xm,vm"))) + (match_dup 1) + (const_int 1)))] + "TARGET_SSE" + "@ + <multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %2} + v<multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sse<multdiv_mnemonic>") + (set_attr "prefix" "orig,vex") + (set_attr "btver2_decode" "direct,double") + (set_attr "mode" "<ssescalarmode>")]) + (define_insn "<sse>_vm<multdiv_mnemonic><mode>3<mask_scalar_name><round_scalar_name>" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (multdiv:VF_128 (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "vector_operand" "xBm,<round_scalar_constraint>")) + (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_scalar_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_SSE" @@ -1869,6 +2036,25 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "*sse_vmrcpv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm,xm")] + UNSPEC_RCP)) + (match_operand:V4SF 2 "register_operand" "0,x") + (const_int 1)))] + "TARGET_SSE" + "@ + rcpss\t{%1, %0|%0, %1} + vrcpss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") + (set_attr "btver2_sse_attr" "rcp") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "SF")]) + (define_insn "<mask_codefor>rcp14<mode><mask_name>" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL @@ -1950,7 +2136,7 @@ [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (sqrt:VF_128 - (match_operand:VF_128 1 "vector_operand" "xBm,<round_scalar_constraint>")) + (match_operand:VF_128 1 "nonimmediate_operand" "xm,<round_scalar_constraint>")) (match_operand:VF_128 2 "register_operand" "0,v") (const_int 1)))] "TARGET_SSE" @@ -1964,11 +2150,30 @@ (set_attr "btver2_sse_attr" "sqrt") (set_attr "mode" "<ssescalarmode>")]) +(define_insn "*<sse>_vmsqrt<mode>2<mask_scalar_name><round_scalar_name>" + [(set (match_operand:VF_128 0 "register_operand" "=x,v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (sqrt:<ssescalarmode> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "xm,<round_scalar_constraint>"))) + (match_operand:VF_128 2 "register_operand" "0,v") + (const_int 1)))] + "TARGET_SSE" + "@ + sqrt<ssescalarmodesuffix>\t{%1, %0|%0, %1} + vsqrt<ssescalarmodesuffix>\t{<round_scalar_mask_op3>%1, %2, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %2, %1<round_scalar_mask_op3>}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") + (set_attr "prefix" "<round_scalar_prefix>") + (set_attr "btver2_sse_attr" "sqrt") + (set_attr "mode" "<ssescalarmode>")]) + (define_expand "rsqrt<mode>2" [(set (match_operand:VF1_128_256 0 "register_operand") (unspec:VF1_128_256 [(match_operand:VF1_128_256 1 "vector_operand")] UNSPEC_RSQRT))] - "TARGET_SSE_MATH" + "TARGET_SSE && TARGET_SSE_MATH" { ix86_emit_swsqrtsf (operands[0], operands[1], <MODE>mode, true); DONE; @@ -1979,7 +2184,7 @@ (unspec:V16SF [(match_operand:V16SF 1 "vector_operand")] UNSPEC_RSQRT28))] - "TARGET_SSE_MATH && TARGET_AVX512ER" + "TARGET_AVX512ER && TARGET_SSE_MATH" { ix86_emit_swsqrtsf (operands[0], operands[1], V16SFmode, true); DONE; @@ -2053,6 +2258,23 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) +(define_insn "*sse_vmrsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm,xm")] + UNSPEC_RSQRT)) + (match_operand:V4SF 2 "register_operand" "0,x") + (const_int 1)))] + "TARGET_SSE" + "@ + rsqrtss\t{%1, %0|%0, %1} + vrsqrtss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sse") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "SF")]) + (define_expand "<code><mode>3<mask_name><round_saeonly_name>" [(set (match_operand:VF 0 "register_operand") (smaxmin:VF @@ -2118,12 +2340,36 @@ (set_attr "prefix" "<mask_prefix3>") (set_attr "mode" "<MODE>")]) +;; Standard scalar operation patterns which preserve the rest of the +;; vector for combiner. +(define_insn "*ieee_<ieee_maxmin><mode>3" + [(set (match_operand:VF_128 0 "register_operand" "=x,v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (unspec:<ssescalarmode> + [(vec_select:<ssescalarmode> + (match_operand:VF_128 1 "register_operand" "0,v") + (parallel [(const_int 0)])) + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "xm,vm")] + IEEE_MAXMIN)) + (match_dup 1) + (const_int 1)))] + "TARGET_SSE" + "@ + <ieee_maxmin><ssescalarmodesuffix>\t{%2, %0|%0, %2} + v<ieee_maxmin><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sseadd") + (set_attr "btver2_sse_attr" "maxmin") + (set_attr "prefix" "orig,vex") + (set_attr "mode" "<ssescalarmode>")]) + (define_insn "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>" [(set (match_operand:VF_128 0 "register_operand" "=x,v") (vec_merge:VF_128 (smaxmin:VF_128 (match_operand:VF_128 1 "register_operand" "0,v") - (match_operand:VF_128 2 "vector_operand" "xBm,<round_saeonly_scalar_constraint>")) + (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_saeonly_scalar_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_SSE" @@ -2538,9 +2784,30 @@ DONE; }) +(define_expand "reduc_plus_scal_v16qi" + [(plus:V16QI + (match_operand:QI 0 "register_operand") + (match_operand:V16QI 1 "register_operand"))] + "TARGET_SSE2" +{ + rtx tmp = gen_reg_rtx (V1TImode); + emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, operands[1]), + GEN_INT (64))); + rtx tmp2 = gen_reg_rtx (V16QImode); + emit_insn (gen_addv16qi3 (tmp2, operands[1], gen_lowpart (V16QImode, tmp))); + rtx tmp3 = gen_reg_rtx (V16QImode); + emit_move_insn (tmp3, CONST0_RTX (V16QImode)); + rtx tmp4 = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_psadbw (tmp4, tmp2, tmp3)); + tmp4 = gen_lowpart (V16QImode, tmp4); + emit_insn (gen_vec_extractv16qiqi (operands[0], tmp4, const0_rtx)); + DONE; +}) + (define_mode_iterator REDUC_PLUS_MODE [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX") - (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")]) + (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F") + (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")]) (define_expand "reduc_plus_scal_<mode>" [(plus:REDUC_PLUS_MODE @@ -2551,8 +2818,8 @@ rtx tmp = gen_reg_rtx (<ssehalfvecmode>mode); emit_insn (gen_vec_extract_hi_<mode> (tmp, operands[1])); rtx tmp2 = gen_reg_rtx (<ssehalfvecmode>mode); - emit_insn (gen_add<ssehalfvecmodelower>3 - (tmp2, tmp, gen_lowpart (<ssehalfvecmode>mode, operands[1]))); + rtx tmp3 = gen_lowpart (<ssehalfvecmode>mode, operands[1]); + emit_insn (gen_add<ssehalfvecmodelower>3 (tmp2, tmp, tmp3)); emit_insn (gen_reduc_plus_scal_<ssehalfvecmodelower> (operands[0], tmp2)); DONE; }) @@ -2560,7 +2827,7 @@ ;; Modes handled by reduc_sm{in,ax}* patterns. (define_mode_iterator REDUC_SSE_SMINMAX_MODE [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE") - (V2DI "TARGET_SSE") (V4SI "TARGET_SSE") (V8HI "TARGET_SSE") + (V2DI "TARGET_SSE4_2") (V4SI "TARGET_SSE") (V8HI "TARGET_SSE") (V16QI "TARGET_SSE")]) (define_expand "reduc_<code>_scal_<mode>" @@ -2746,7 +3013,7 @@ (vec_merge:VF_128 (match_operator:VF_128 3 "sse_comparison_operator" [(match_operand:VF_128 1 "register_operand" "0,x") - (match_operand:VF_128 2 "vector_operand" "xBm,xm")]) + (match_operand:VF_128 2 "nonimmediate_operand" "xm,xm")]) (match_dup 1) (const_int 1)))] "TARGET_SSE" @@ -2771,7 +3038,7 @@ (V8HI "const_0_to_7_operand") (V16QI "const_0_to_7_operand")]) (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:V48_AVX512VL 1 "register_operand" "v") (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "<round_saeonly_constraint>") @@ -2784,8 +3051,20 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (match_operator:<avx512fmaskmode> 3 "ix86_comparison_int_operator" + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "<round_saeonly_constraint>")]))] + "TARGET_AVX512F && <round_saeonly_mode512bit_condition>" + "vpcmp<ssemodesuffix>\t{%I3, <round_saeonly_mask_scalar_merge_op4>%2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2<round_saeonly_mask_scalar_merge_op4>, %I3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "<avx512>_cmp<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI12_AVX512VL 1 "register_operand" "v") (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") @@ -2798,8 +3077,20 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_cmp<mode>3<mask_scalar_merge_name>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (match_operator:<avx512fmaskmode> 3 "ix86_comparison_int_operator" + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]))] + "TARGET_AVX512BW" + "vpcmp<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI12_AVX512VL 1 "register_operand" "v") (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") @@ -2812,8 +3103,20 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_ucmp<mode>3<mask_scalar_merge_name>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (match_operator:<avx512fmaskmode> 3 "ix86_comparison_uns_operator" + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")]))] + "TARGET_AVX512BW" + "vpcmpu<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "<avx512>_ucmp<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_AVX512VL 1 "register_operand" "v") (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") @@ -2826,12 +3129,24 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_ucmp<mode>3<mask_scalar_merge_name>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") + (match_operator:<avx512fmaskmode> 3 "ix86_comparison_uns_operator" + [(match_operand:VI48_AVX512VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")]))] + "TARGET_AVX512F" + "vpcmpu<ssemodesuffix>\t{%I3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %I3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + (match_operand:VF_128 2 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_31_operand" "n")] UNSPEC_PCMP) (const_int 1)))] @@ -2843,11 +3158,11 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "avx512f_vmcmp<mode>3_mask<round_saeonly_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + (match_operand:VF_128 2 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") (match_operand:SI 3 "const_0_to_31_operand" "n")] UNSPEC_PCMP) (and:<avx512fmaskmode> @@ -2861,10 +3176,10 @@ (set_attr "mode" "<ssescalarmode>")]) (define_insn "avx512f_maskcmp<mode>3" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (match_operator:<avx512fmaskmode> 3 "sse_comparison_operator" - [(match_operand:VF 1 "register_operand" "v") - (match_operand:VF 2 "nonimmediate_operand" "vm")]))] + [(match_operand:VF_AVX512VL 1 "register_operand" "v") + (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "vm")]))] "TARGET_AVX512F" "vcmp%D3<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "ssecmp") @@ -3115,13 +3430,19 @@ (match_operand:<avx512fmaskmode> 3 "register_operand")))] "TARGET_AVX512BW") +;; As vcondv4div4df and vcondv8siv8sf are enabled already with TARGET_AVX, +;; and their condition can be folded late into a constant, we need to +;; support vcond_mask_v4div4di and vcond_mask_v8siv8si for TARGET_AVX. +(define_mode_iterator VI_256_AVX2 [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") + V8SI V4DI]) + (define_expand "vcond_mask_<mode><sseintvecmodelower>" - [(set (match_operand:VI_256 0 "register_operand") - (vec_merge:VI_256 - (match_operand:VI_256 1 "nonimmediate_operand") - (match_operand:VI_256 2 "nonimm_or_0_operand") + [(set (match_operand:VI_256_AVX2 0 "register_operand") + (vec_merge:VI_256_AVX2 + (match_operand:VI_256_AVX2 1 "nonimmediate_operand") + (match_operand:VI_256_AVX2 2 "nonimm_or_0_operand") (match_operand:<sseintvecmode> 3 "register_operand")))] - "TARGET_AVX2" + "TARGET_AVX" { ix86_expand_sse_movcc (operands[0], operands[3], operands[1], operands[2]); @@ -3194,7 +3515,7 @@ (match_operand:VF_128_256 2 "vector_operand" "xBm,xm,vm,vm")))] "TARGET_SSE && <mask_avx512vl_condition>" { - static char buf[128]; + char buf[128]; const char *ops; const char *suffix; @@ -3229,7 +3550,8 @@ } snprintf (buf, sizeof (buf), ops, suffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx512dq,avx512f") (set_attr "type" "sselog") @@ -3241,16 +3563,14 @@ (const_string "<sseintvecmode2>") (eq_attr "alternative" "3") (const_string "<sseintvecmode2>") - (and (match_test "<MODE_SIZE> == 16") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) - (const_string "<ssePSmode>") (match_test "TARGET_AVX") (const_string "<MODE>") (match_test "optimize_function_for_size_p (cfun)") (const_string "V4SF") - ] - (const_string "<MODE>")))]) - + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + ] + (const_string "<MODE>")))]) (define_insn "<sse>_andnot<mode>3<mask_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") @@ -3260,7 +3580,7 @@ (match_operand:VF_512 2 "nonimmediate_operand" "vm")))] "TARGET_AVX512F" { - static char buf[128]; + char buf[128]; const char *ops; const char *suffix; @@ -3277,7 +3597,8 @@ snprintf (buf, sizeof (buf), "v%sandn%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}", ops, suffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "type" "sselog") (set_attr "prefix" "evex") @@ -3310,7 +3631,7 @@ "TARGET_SSE && <mask_avx512vl_condition> && !(MEM_P (operands[1]) && MEM_P (operands[2]))" { - static char buf[128]; + char buf[128]; const char *ops; const char *suffix; @@ -3345,7 +3666,8 @@ } snprintf (buf, sizeof (buf), ops, suffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx512dq,avx512f") (set_attr "type" "sselog") @@ -3357,15 +3679,14 @@ (const_string "<sseintvecmode2>") (eq_attr "alternative" "3") (const_string "<sseintvecmode2>") - (and (match_test "<MODE_SIZE> == 16") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) - (const_string "<ssePSmode>") (match_test "TARGET_AVX") (const_string "<MODE>") (match_test "optimize_function_for_size_p (cfun)") (const_string "V4SF") - ] - (const_string "<MODE>")))]) + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + ] + (const_string "<MODE>")))]) (define_insn "*<code><mode>3<mask_name>" [(set (match_operand:VF_512 0 "register_operand" "=v") @@ -3374,7 +3695,7 @@ (match_operand:VF_512 2 "nonimmediate_operand" "vm")))] "TARGET_AVX512F && !(MEM_P (operands[1]) && MEM_P (operands[2]))" { - static char buf[128]; + char buf[128]; const char *ops; const char *suffix; @@ -3391,7 +3712,8 @@ snprintf (buf, sizeof (buf), "v%s<logic>%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}", ops, suffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "type" "sselog") (set_attr "prefix" "evex") @@ -3418,6 +3740,29 @@ operands[5] = gen_reg_rtx (<MODE>mode); }) +(define_expand "xorsign<mode>3" + [(set (match_dup 4) + (and:VF (match_dup 3) + (match_operand:VF 2 "vector_operand"))) + (set (match_operand:VF 0 "register_operand") + (xor:VF (match_dup 4) + (match_operand:VF 1 "vector_operand")))] + "TARGET_SSE" +{ + operands[3] = ix86_build_signbit_mask (<MODE>mode, 1, 0); + + operands[4] = gen_reg_rtx (<MODE>mode); +}) + +(define_expand "signbit<mode>2" + [(set (match_operand:<sseintvecmode> 0 "register_operand") + (lshiftrt:<sseintvecmode> + (subreg:<sseintvecmode> + (match_operand:VF1_AVX2 1 "register_operand") 0) + (match_dup 2)))] + "TARGET_SSE2" + "operands[2] = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode)-1);") + ;; Also define scalar versions. These are used for abs, neg, and ;; conditional move. Using subregs into vector modes causes register ;; allocation lossage. These patterns do not allow memory operands @@ -3431,7 +3776,7 @@ (match_operand:MODEF 2 "register_operand" "x,x,v,v")))] "SSE_FLOAT_MODE_P (<MODE>mode)" { - static char buf[128]; + char buf[128]; const char *ops; const char *suffix = (get_attr_mode (insn) == MODE_V4SF) ? "ps" : "<ssevecmodesuffix>"; @@ -3467,7 +3812,8 @@ } snprintf (buf, sizeof (buf), ops, suffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx512vl,avx512f") (set_attr "type" "sselog") @@ -3481,15 +3827,14 @@ (if_then_else (match_test "TARGET_AVX512DQ") (const_string "<avx512fvecmode>") (const_string "XI")) - (and (match_test "<MODE_SIZE> == 16") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) - (const_string "V4SF") (match_test "TARGET_AVX") (const_string "<ssevecmode>") (match_test "optimize_function_for_size_p (cfun)") (const_string "V4SF") - ] - (const_string "<ssevecmode>")))]) + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + ] + (const_string "<ssevecmode>")))]) (define_insn "*andnottf3" [(set (match_operand:TF 0 "register_operand" "=x,x,v,v") @@ -3498,7 +3843,7 @@ (match_operand:TF 2 "vector_operand" "xBm,xm,vm,v")))] "TARGET_SSE" { - static char buf[128]; + char buf[128]; const char *ops; const char *tmp = (which_alternative >= 2 ? "pandnq" @@ -3521,7 +3866,8 @@ } snprintf (buf, sizeof (buf), ops, tmp); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx512vl,avx512f") (set_attr "type" "sselog") @@ -3537,15 +3883,15 @@ (const_string "TI") (eq_attr "alternative" "3") (const_string "XI") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") - (const_string "V4SF") (match_test "TARGET_AVX") (const_string "TI") (ior (not (match_test "TARGET_SSE2")) (match_test "optimize_function_for_size_p (cfun)")) (const_string "V4SF") - ] - (const_string "TI")))]) + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + ] + (const_string "TI")))]) (define_insn "*<code><mode>3" [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v") @@ -3554,7 +3900,7 @@ (match_operand:MODEF 2 "register_operand" "x,x,v,v")))] "SSE_FLOAT_MODE_P (<MODE>mode)" { - static char buf[128]; + char buf[128]; const char *ops; const char *suffix = (get_attr_mode (insn) == MODE_V4SF) ? "ps" : "<ssevecmodesuffix>"; @@ -3589,7 +3935,8 @@ } snprintf (buf, sizeof (buf), ops, suffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx512vl,avx512f") (set_attr "type" "sselog") @@ -3603,15 +3950,14 @@ (if_then_else (match_test "TARGET_AVX512DQ") (const_string "<avx512fvecmode>") (const_string "XI")) - (and (match_test "<MODE_SIZE> == 16") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) - (const_string "V4SF") (match_test "TARGET_AVX") (const_string "<ssevecmode>") (match_test "optimize_function_for_size_p (cfun)") (const_string "V4SF") - ] - (const_string "<ssevecmode>")))]) + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + ] + (const_string "<ssevecmode>")))]) (define_expand "<code>tf3" [(set (match_operand:TF 0 "register_operand") @@ -3628,7 +3974,7 @@ (match_operand:TF 2 "vector_operand" "xBm,xm,vm,v")))] "TARGET_SSE && !(MEM_P (operands[1]) && MEM_P (operands[2]))" { - static char buf[128]; + char buf[128]; const char *ops; const char *tmp = (which_alternative >= 2 ? "p<logic>q" @@ -3651,7 +3997,8 @@ } snprintf (buf, sizeof (buf), ops, tmp); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx512vl,avx512f") (set_attr "type" "sselog") @@ -3667,15 +4014,15 @@ (const_string "TI") (eq_attr "alternative" "3") (const_string "QI") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") - (const_string "V4SF") (match_test "TARGET_AVX") (const_string "TI") (ior (not (match_test "TARGET_SSE2")) (match_test "optimize_function_for_size_p (cfun)")) (const_string "V4SF") - ] - (const_string "TI")))]) + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + ] + (const_string "TI")))]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -3827,12 +4174,12 @@ (set_attr "mode" "<MODE>")]) (define_insn "*<sd_mask_codefor>fma_fmadd_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") + [(set (match_operand:VF_AVX512 0 "register_operand" "=v") (fma:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v") - (match_operand:VF_AVX512 2 "register_operand" "v,0") + (match_operand:VF_AVX512 1 "register_operand" "%0") + (match_operand:VF_AVX512 2 "register_operand" "v") (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m,m"))))] + (match_operand:<ssescalarmode> 3 "memory_operand" "m"))))] "TARGET_AVX512F && <sd_mask_mode512bit_condition>" "vfmadd213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" [(set_attr "type" "ssemuladd") @@ -3871,8 +4218,8 @@ (vec_merge:VF_AVX512VL (fma:VF_AVX512VL (match_operand:VF_AVX512VL 1 "register_operand" "0,0") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v") - (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>")) + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") + (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))] "TARGET_AVX512F && <round_mode512bit_condition>" @@ -3886,8 +4233,8 @@ [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (vec_merge:VF_AVX512VL (fma:VF_AVX512VL - (match_operand:VF_AVX512VL 1 "register_operand" "v") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v") + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (match_operand:VF_AVX512VL 3 "register_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] @@ -3944,13 +4291,13 @@ (set_attr "mode" "<MODE>")]) (define_insn "*<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") + [(set (match_operand:VF_AVX512 0 "register_operand" "=v") (fma:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v") - (match_operand:VF_AVX512 2 "register_operand" "v,0") + (match_operand:VF_AVX512 1 "register_operand" "%0") + (match_operand:VF_AVX512 2 "register_operand" "v") (neg:VF_AVX512 (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m,m")))))] + (match_operand:<ssescalarmode> 3 "memory_operand" "m")))))] "TARGET_AVX512F && <sd_mask_mode512bit_condition>" "vfmsub213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" [(set_attr "type" "ssemuladd") @@ -3991,9 +4338,9 @@ (vec_merge:VF_AVX512VL (fma:VF_AVX512VL (match_operand:VF_AVX512VL 1 "register_operand" "0,0") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v") + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>"))) + (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))] "TARGET_AVX512F" @@ -4007,8 +4354,8 @@ [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (vec_merge:VF_AVX512VL (fma:VF_AVX512VL - (match_operand:VF_AVX512VL 1 "register_operand" "v") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v") + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VF_AVX512VL (match_operand:VF_AVX512VL 3 "register_operand" "0"))) (match_dup 3) @@ -4066,13 +4413,13 @@ (set_attr "mode" "<MODE>")]) (define_insn "*<sd_mask_codefor>fma_fnmadd_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") + [(set (match_operand:VF_AVX512 0 "register_operand" "=v") (fma:VF_AVX512 (neg:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v")) - (match_operand:VF_AVX512 2 "register_operand" "v,0") + (match_operand:VF_AVX512 1 "register_operand" "%0")) + (match_operand:VF_AVX512 2 "register_operand" "v") (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m,m"))))] + (match_operand:<ssescalarmode> 3 "memory_operand" "m"))))] "TARGET_AVX512F && <sd_mask_mode512bit_condition>" "vfnmadd213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" [(set_attr "type" "ssemuladd") @@ -4114,8 +4461,8 @@ (fma:VF_AVX512VL (neg:VF_AVX512VL (match_operand:VF_AVX512VL 1 "register_operand" "0,0")) - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v") - (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>")) + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") + (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))] "TARGET_AVX512F && <round_mode512bit_condition>" @@ -4130,8 +4477,8 @@ (vec_merge:VF_AVX512VL (fma:VF_AVX512VL (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 1 "register_operand" "v")) - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v")) + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (match_operand:VF_AVX512VL 3 "register_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] @@ -4190,14 +4537,14 @@ (set_attr "mode" "<MODE>")]) (define_insn "*<sd_mask_codefor>fma_fnmsub_<mode><sd_maskz_name>_bcst_1" - [(set (match_operand:VF_AVX512 0 "register_operand" "=v,v") + [(set (match_operand:VF_AVX512 0 "register_operand" "=v") (fma:VF_AVX512 (neg:VF_AVX512 - (match_operand:VF_AVX512 1 "register_operand" "0,v")) - (match_operand:VF_AVX512 2 "register_operand" "v,0") + (match_operand:VF_AVX512 1 "register_operand" "%0")) + (match_operand:VF_AVX512 2 "register_operand" "v") (neg:VF_AVX512 (vec_duplicate:VF_AVX512 - (match_operand:<ssescalarmode> 3 "memory_operand" "m,m")))))] + (match_operand:<ssescalarmode> 3 "memory_operand" "m")))))] "TARGET_AVX512F && <sd_mask_mode512bit_condition>" "vfnmsub213<ssemodesuffix>\t{%3<avx512bcst>, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<avx512bcst>}" [(set_attr "type" "ssemuladd") @@ -4241,9 +4588,9 @@ (fma:VF_AVX512VL (neg:VF_AVX512VL (match_operand:VF_AVX512VL 1 "register_operand" "0,0")) - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v") + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>"))) + (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))] "TARGET_AVX512F && <round_mode512bit_condition>" @@ -4258,8 +4605,8 @@ (vec_merge:VF_AVX512VL (fma:VF_AVX512VL (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 1 "register_operand" "v")) - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512VL 1 "<round_nimm_predicate>" "%v")) + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VF_AVX512VL (match_operand:VF_AVX512VL 3 "register_operand" "0"))) (match_dup 3) @@ -4341,8 +4688,8 @@ (vec_merge:VF_AVX512VL (unspec:VF_AVX512VL [(match_operand:VF_AVX512VL 1 "register_operand" "0,0") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v") - (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>")] + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") + (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")] UNSPEC_FMADDSUB) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))] @@ -4358,7 +4705,7 @@ (vec_merge:VF_AVX512VL (unspec:VF_AVX512VL [(match_operand:VF_AVX512VL 1 "register_operand" "v") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (match_operand:VF_AVX512VL 3 "register_operand" "0")] UNSPEC_FMADDSUB) (match_dup 3) @@ -4408,9 +4755,9 @@ (vec_merge:VF_AVX512VL (unspec:VF_AVX512VL [(match_operand:VF_AVX512VL 1 "register_operand" "0,0") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>,v") + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VF_AVX512VL - (match_operand:VF_AVX512VL 3 "nonimmediate_operand" "v,<round_constraint>"))] + (match_operand:VF_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))] UNSPEC_FMADDSUB) (match_dup 1) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))] @@ -4426,7 +4773,7 @@ (vec_merge:VF_AVX512VL (unspec:VF_AVX512VL [(match_operand:VF_AVX512VL 1 "register_operand" "v") - (match_operand:VF_AVX512VL 2 "nonimmediate_operand" "<round_constraint>") + (match_operand:VF_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VF_AVX512VL (match_operand:VF_AVX512VL 3 "register_operand" "0"))] UNSPEC_FMADDSUB) @@ -4444,9 +4791,9 @@ [(set (match_operand:VF_128 0 "register_operand") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "<round_nimm_predicate>") - (match_operand:VF_128 2 "<round_nimm_predicate>") - (match_operand:VF_128 3 "<round_nimm_predicate>")) + (match_operand:VF_128 1 "register_operand") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>")) (match_dup 1) (const_int 1)))] "TARGET_FMA") @@ -4455,10 +4802,10 @@ [(set (match_operand:VF_128 0 "register_operand") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "<round_nimm_predicate>") - (match_operand:VF_128 2 "<round_nimm_predicate>") + (match_operand:VF_128 1 "register_operand") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>") (neg:VF_128 - (match_operand:VF_128 3 "<round_nimm_predicate>"))) + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>"))) (match_dup 1) (const_int 1)))] "TARGET_FMA") @@ -4468,9 +4815,9 @@ (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 2 "<round_nimm_predicate>")) - (match_operand:VF_128 1 "<round_nimm_predicate>") - (match_operand:VF_128 3 "<round_nimm_predicate>")) + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>")) + (match_operand:VF_128 1 "register_operand") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>")) (match_dup 1) (const_int 1)))] "TARGET_FMA") @@ -4480,10 +4827,10 @@ (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 2 "<round_nimm_predicate>")) - (match_operand:VF_128 1 "<round_nimm_predicate>") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>")) + (match_operand:VF_128 1 "register_operand") (neg:VF_128 - (match_operand:VF_128 3 "<round_nimm_predicate>"))) + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>"))) (match_dup 1) (const_int 1)))] "TARGET_FMA") @@ -4492,9 +4839,9 @@ [(set (match_operand:VF_128 0 "register_operand" "=v,v") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "<round_nimm_predicate>" " 0, 0") - (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v") - (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>")) + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>, v") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" @@ -4508,10 +4855,10 @@ [(set (match_operand:VF_128 0 "register_operand" "=v,v") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "<round_nimm_predicate>" "0,0") - (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v") + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (neg:VF_128 - (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>"))) + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" @@ -4526,9 +4873,9 @@ (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v")) - (match_operand:VF_128 1 "<round_nimm_predicate>" "0,0") - (match_operand:VF_128 3 "<round_nimm_predicate>" "v,<round_constraint>")) + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" @@ -4543,10 +4890,10 @@ (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v")) - (match_operand:VF_128 1 "<round_nimm_predicate>" " 0, 0") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) + (match_operand:VF_128 1 "register_operand" "0,0") (neg:VF_128 - (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>"))) + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) (match_dup 1) (const_int 1)))] "TARGET_FMA || TARGET_AVX512F" @@ -4556,6 +4903,252 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "<MODE>")]) +(define_insn "avx512f_vmfmadd_<mode>_mask<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) + (match_dup 1) + (match_operand:QI 4 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfmadd132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>} + vfmadd213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512f_vmfmadd_<mode>_mask3<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (match_operand:VF_128 1 "<round_nimm_scalar_predicate>" "%v") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>") + (match_operand:VF_128 3 "register_operand" "0")) + (match_dup 3) + (match_operand:QI 4 "register_operand" "Yk")) + (match_dup 3) + (const_int 1)))] + "TARGET_AVX512F" + "vfmadd231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_expand "avx512f_vmfmadd_<mode>_maskz<round_expand_name>" + [(match_operand:VF_128 0 "register_operand") + (match_operand:VF_128 1 "<round_expand_nimm_predicate>") + (match_operand:VF_128 2 "<round_expand_nimm_predicate>") + (match_operand:VF_128 3 "<round_expand_nimm_predicate>") + (match_operand:QI 4 "register_operand")] + "TARGET_AVX512F" +{ + emit_insn (gen_avx512f_vmfmadd_<mode>_maskz_1<round_expand_name> ( + operands[0], operands[1], operands[2], operands[3], + CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>)); + DONE; +}) + +(define_insn "avx512f_vmfmadd_<mode>_maskz_1<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) + (match_operand:VF_128 4 "const0_operand" "C,C") + (match_operand:QI 5 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfmadd132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>} + vfmadd213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfmsub_<mode>_mask<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") + (neg:VF_128 + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) + (match_dup 1) + (match_operand:QI 4 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfmsub132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>} + vfmsub213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "avx512f_vmfmsub_<mode>_mask3<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (match_operand:VF_128 1 "<round_nimm_scalar_predicate>" "%v") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>") + (neg:VF_128 + (match_operand:VF_128 3 "register_operand" "0"))) + (match_dup 3) + (match_operand:QI 4 "register_operand" "Yk")) + (match_dup 3) + (const_int 1)))] + "TARGET_AVX512F" + "vfmsub231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfmsub_<mode>_maskz_1<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") + (neg:VF_128 + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) + (match_operand:VF_128 4 "const0_operand" "C,C") + (match_operand:QI 5 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfmsub132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>} + vfmsub213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfnmadd_<mode>_mask<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (neg:VF_128 + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) + (match_dup 1) + (match_operand:QI 4 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfnmadd132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>} + vfnmadd213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfnmadd_<mode>_mask3<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (neg:VF_128 + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>")) + (match_operand:VF_128 1 "<round_nimm_scalar_predicate>" "%v") + (match_operand:VF_128 3 "register_operand" "0")) + (match_dup 3) + (match_operand:QI 4 "register_operand" "Yk")) + (match_dup 3) + (const_int 1)))] + "TARGET_AVX512F" + "vfnmadd231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfnmadd_<mode>_maskz_1<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (neg:VF_128 + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) + (match_operand:VF_128 1 "register_operand" "0,0") + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) + (match_operand:VF_128 4 "const0_operand" "C,C") + (match_operand:QI 5 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfnmadd132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>} + vfnmadd213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfnmsub_<mode>_mask<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (neg:VF_128 + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) + (match_operand:VF_128 1 "register_operand" "0,0") + (neg:VF_128 + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) + (match_dup 1) + (match_operand:QI 4 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfnmsub132<ssescalarmodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>} + vfnmsub213<ssescalarmodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %<iptr>2, %<iptr>3<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfnmsub_<mode>_mask3<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (neg:VF_128 + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>")) + (match_operand:VF_128 1 "<round_nimm_scalar_predicate>" "%v") + (neg:VF_128 + (match_operand:VF_128 3 "register_operand" "0"))) + (match_dup 3) + (match_operand:QI 4 "register_operand" "Yk")) + (match_dup 3) + (const_int 1)))] + "TARGET_AVX512F" + "vfnmsub231<ssescalarmodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %<iptr>3, %<iptr>2<round_op5>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_vmfnmsub_<mode>_maskz_1<round_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v,v") + (vec_merge:VF_128 + (vec_merge:VF_128 + (fma:VF_128 + (neg:VF_128 + (match_operand:VF_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) + (match_operand:VF_128 1 "register_operand" "0,0") + (neg:VF_128 + (match_operand:VF_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) + (match_operand:VF_128 4 "const0_operand" "C,C") + (match_operand:QI 5 "register_operand" "Yk,Yk")) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX512F" + "@ + vfnmsub132<ssescalarmodesuffix>\t{<round_op6>%2, %3, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>3, %<iptr>2<round_op6>} + vfnmsub213<ssescalarmodesuffix>\t{<round_op6>%3, %2, %0%{%5%}%{z%}|%0%{%5%}%{z%}, %<iptr>2, %<iptr>3<round_op6>}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + ;; FMA4 floating point scalar intrinsics. These write the ;; entire destination register, with the high-order elements zeroed. @@ -4637,39 +5230,93 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define_insn "sse_cvtpi2ps" - [(set (match_operand:V4SF 0 "register_operand" "=x") +(define_insn_and_split "sse_cvtpi2ps" + [(set (match_operand:V4SF 0 "register_operand" "=x,x,Yv") (vec_merge:V4SF (vec_duplicate:V4SF - (float:V2SF (match_operand:V2SI 2 "nonimmediate_operand" "ym"))) - (match_operand:V4SF 1 "register_operand" "0") - (const_int 3)))] - "TARGET_SSE" - "cvtpi2ps\t{%2, %0|%0, %2}" - [(set_attr "type" "ssecvt") + (float:V2SF (match_operand:V2SI 2 "register_mmxmem_operand" "ym,x,Yv"))) + (match_operand:V4SF 1 "register_operand" "0,0,Yv") + (const_int 3))) + (clobber (match_scratch:V4SF 3 "=X,x,Yv"))] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSE" + "@ + cvtpi2ps\t{%2, %0|%0, %2} + # + #" + "TARGET_SSE2 && reload_completed + && SSE_REG_P (operands[2])" + [(const_int 0)] +{ + rtx op2 = lowpart_subreg (V4SImode, operands[2], + GET_MODE (operands[2])); + /* Generate SSE2 cvtdq2ps. */ + emit_insn (gen_floatv4siv4sf2 (operands[3], op2)); + + /* Merge operands[3] with operands[0]. */ + rtx mask, op1; + if (TARGET_AVX) + { + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (0), GEN_INT (1), + GEN_INT (6), GEN_INT (7))); + op1 = gen_rtx_VEC_CONCAT (V8SFmode, operands[3], operands[1]); + op2 = gen_rtx_VEC_SELECT (V4SFmode, op1, mask); + emit_insn (gen_rtx_SET (operands[0], op2)); + } + else + { + /* NB: SSE can only concatenate OP0 and OP3 to OP0. */ + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), GEN_INT (3), + GEN_INT (4), GEN_INT (5))); + op1 = gen_rtx_VEC_CONCAT (V8SFmode, operands[0], operands[3]); + op2 = gen_rtx_VEC_SELECT (V4SFmode, op1, mask); + emit_insn (gen_rtx_SET (operands[0], op2)); + + /* Swap bits 0:63 with bits 64:127. */ + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), GEN_INT (3), + GEN_INT (0), GEN_INT (1))); + rtx dest = lowpart_subreg (V4SImode, operands[0], + GET_MODE (operands[0])); + op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); + emit_insn (gen_rtx_SET (dest, op1)); + } + DONE; +} + [(set_attr "mmx_isa" "native,sse_noavx,avx") + (set_attr "type" "ssecvt") (set_attr "mode" "V4SF")]) (define_insn "sse_cvtps2pi" - [(set (match_operand:V2SI 0 "register_operand" "=y") + [(set (match_operand:V2SI 0 "register_operand" "=y,Yv") (vec_select:V2SI - (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + (unspec:V4SI [(match_operand:V4SF 1 "register_mmxmem_operand" "xm,YvBm")] UNSPEC_FIX_NOTRUNC) (parallel [(const_int 0) (const_int 1)])))] - "TARGET_SSE" - "cvtps2pi\t{%1, %0|%0, %q1}" - [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSE" + "@ + cvtps2pi\t{%1, %0|%0, %q1} + %vcvtps2dq\t{%1, %0|%0, %1}" + [(set_attr "isa" "*,sse2") + (set_attr "mmx_isa" "native,*") + (set_attr "type" "ssecvt") + (set_attr "unit" "mmx,*") (set_attr "mode" "DI")]) (define_insn "sse_cvttps2pi" - [(set (match_operand:V2SI 0 "register_operand" "=y") + [(set (match_operand:V2SI 0 "register_operand" "=y,Yv") (vec_select:V2SI - (fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm")) + (fix:V4SI (match_operand:V4SF 1 "register_mmxmem_operand" "xm,YvBm")) (parallel [(const_int 0) (const_int 1)])))] - "TARGET_SSE" - "cvttps2pi\t{%1, %0|%0, %q1}" - [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSE" + "@ + cvttps2pi\t{%1, %0|%0, %q1} + %vcvttps2dq\t{%1, %0|%0, %1}" + [(set_attr "isa" "*,sse2") + (set_attr "mmx_isa" "native,*") + (set_attr "type" "ssecvt") + (set_attr "unit" "mmx,*") (set_attr "prefix_rep" "0") (set_attr "mode" "SF")]) @@ -4728,7 +5375,7 @@ (unspec:SWI48 [(match_operand:SF 1 "nonimmediate_operand" "v,m")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE" - "%vcvtss2si<rex64suffix>\t{%1, %0|%0, %k1}" + "%vcvtss2si<rex64suffix>\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") @@ -4758,11 +5405,11 @@ (vec_merge:VF_128 (vec_duplicate:VF_128 (unsigned_float:<ssescalarmode> - (match_operand:SI 2 "<round_nimm_predicate>" "<round_constraint3>"))) + (match_operand:SI 2 "<round_nimm_scalar_predicate>" "<round_constraint3>"))) (match_operand:VF_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F && <round_modev4sf_condition>" - "vcvtusi2<ssescalarmodesuffix>\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}" + "vcvtusi2<ssescalarmodesuffix>{l}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}" [(set_attr "type" "sseicvt") (set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) @@ -4772,7 +5419,7 @@ (vec_merge:VF_128 (vec_duplicate:VF_128 (unsigned_float:<ssescalarmode> - (match_operand:DI 2 "<round_nimm_predicate>" "<round_constraint3>"))) + (match_operand:DI 2 "<round_nimm_scalar_predicate>" "<round_constraint3>"))) (match_operand:VF_128 1 "register_operand" "v") (const_int 1)))] "TARGET_AVX512F && TARGET_64BIT" @@ -4979,37 +5626,52 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "sse2_cvtpi2pd" - [(set (match_operand:V2DF 0 "register_operand" "=x,x") - (float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "y,m")))] - "TARGET_SSE2" - "cvtpi2pd\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx,*") - (set_attr "prefix_data16" "1,*") + [(set (match_operand:V2DF 0 "register_operand" "=v,x") + (float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "vBm,?!y")))] + "TARGET_SSE2" + "@ + %vcvtdq2pd\t{%1, %0|%0, %1} + cvtpi2pd\t{%1, %0|%0, %1}" + [(set_attr "mmx_isa" "*,native") + (set_attr "type" "ssecvt") + (set_attr "unit" "*,mmx") + (set_attr "prefix_data16" "*,1") + (set_attr "prefix" "maybe_vex,*") (set_attr "mode" "V2DF")]) (define_insn "sse2_cvtpd2pi" - [(set (match_operand:V2SI 0 "register_operand" "=y") - (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "xm")] + [(set (match_operand:V2SI 0 "register_operand" "=v,?!y") + (unspec:V2SI [(match_operand:V2DF 1 "vector_operand" "vBm,xBm")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2" - "cvtpd2pi\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx") + "@ + * return TARGET_AVX ? \"vcvtpd2dq{x}\t{%1, %0|%0, %1}\" : \"cvtpd2dq\t{%1, %0|%0, %1}\"; + cvtpd2pi\t{%1, %0|%0, %1}" + [(set_attr "mmx_isa" "*,native") + (set_attr "type" "ssecvt") + (set_attr "unit" "*,mmx") + (set_attr "amdfam10_decode" "double") + (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double") - (set_attr "btver2_decode" "direct") - (set_attr "prefix_data16" "1") - (set_attr "mode" "DI")]) + (set_attr "prefix_data16" "*,1") + (set_attr "prefix" "maybe_vex,*") + (set_attr "mode" "TI")]) (define_insn "sse2_cvttpd2pi" - [(set (match_operand:V2SI 0 "register_operand" "=y") - (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm")))] - "TARGET_SSE2" - "cvttpd2pi\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") - (set_attr "unit" "mmx") + [(set (match_operand:V2SI 0 "register_operand" "=v,?!y") + (fix:V2SI (match_operand:V2DF 1 "vector_operand" "vBm,xBm")))] + "TARGET_SSE2" + "@ + * return TARGET_AVX ? \"vcvttpd2dq{x}\t{%1, %0|%0, %1}\" : \"cvttpd2dq\t{%1, %0|%0, %1}\"; + cvttpd2pi\t{%1, %0|%0, %1}" + [(set_attr "mmx_isa" "*,native") + (set_attr "type" "ssecvt") + (set_attr "unit" "*,mmx") + (set_attr "amdfam10_decode" "double") + (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double") - (set_attr "prefix_data16" "1") + (set_attr "prefix_data16" "*,1") + (set_attr "prefix" "maybe_vex,*") (set_attr "mode" "TI")]) (define_insn "sse2_cvtsi2sd" @@ -5021,9 +5683,9 @@ (const_int 1)))] "TARGET_SSE2" "@ - cvtsi2sd\t{%2, %0|%0, %2} - cvtsi2sd\t{%2, %0|%0, %2} - vcvtsi2sd\t{%2, %1, %0|%0, %1, %2}" + cvtsi2sd{l}\t{%2, %0|%0, %2} + cvtsi2sd{l}\t{%2, %0|%0, %2} + vcvtsi2sd{l}\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,direct,*") @@ -5043,9 +5705,9 @@ (const_int 1)))] "TARGET_SSE2 && TARGET_64BIT" "@ - cvtsi2sdq\t{%2, %0|%0, %2} - cvtsi2sdq\t{%2, %0|%0, %2} - vcvtsi2sdq\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}" + cvtsi2sd{q}\t{%2, %0|%0, %2} + cvtsi2sd{q}\t{%2, %0|%0, %2} + vcvtsi2sd{q}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,direct,*") @@ -5203,11 +5865,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "float<floatunssuffix>v2div2sf2" +(define_expand "float<floatunssuffix>v2div2sf2" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_concat:V4SF (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) - (const_vector:V2SF [(const_int 0) (const_int 0)])))] + (match_dup 2)))] + "TARGET_AVX512DQ && TARGET_AVX512VL" + "operands[2] = CONST0_RTX (V2SFmode);") + +(define_insn "*float<floatunssuffix>v2div2sf2" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (vec_concat:V4SF + (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) + (match_operand:V2SF 2 "const0_operand" "C")))] "TARGET_AVX512DQ && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ps{x}\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -5241,16 +5911,29 @@ DONE; }) -(define_insn "float<floatunssuffix>v2div2sf2_mask" +(define_expand "float<floatunssuffix>v2div2sf2_mask" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_concat:V4SF (vec_merge:V2SF - (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) + (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) (vec_select:V2SF (match_operand:V4SF 2 "nonimm_or_0_operand" "0C") (parallel [(const_int 0) (const_int 1)])) (match_operand:QI 3 "register_operand" "Yk")) - (const_vector:V2SF [(const_int 0) (const_int 0)])))] + (match_dup 4)))] + "TARGET_AVX512DQ && TARGET_AVX512VL" + "operands[4] = CONST0_RTX (V2SFmode);") + +(define_insn "*float<floatunssuffix>v2div2sf2_mask" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (vec_concat:V4SF + (vec_merge:V2SF + (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) + (vec_select:V2SF + (match_operand:V4SF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V2SF 4 "const0_operand" "C")))] "TARGET_AVX512DQ && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ps{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" [(set_attr "type" "ssecvt") @@ -5263,9 +5946,9 @@ (vec_merge:V2SF (any_float:V2SF (match_operand:V2DI 1 "nonimmediate_operand" "vm")) - (const_vector:V2SF [(const_int 0) (const_int 0)]) + (match_operand:V2SF 3 "const0_operand" "C") (match_operand:QI 2 "register_operand" "Yk")) - (const_vector:V2SF [(const_int 0) (const_int 0)])))] + (match_operand:V2SF 4 "const0_operand" "C")))] "TARGET_AVX512DQ && TARGET_AVX512VL" "vcvt<floatsuffix>qq2ps{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") @@ -5377,16 +6060,16 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "OI")]) -(define_insn "sse2_cvtpd2dq<mask_name>" +(define_insn "sse2_cvtpd2dq" [(set (match_operand:V4SI 0 "register_operand" "=v") (vec_concat:V4SI (unspec:V2SI [(match_operand:V2DF 1 "vector_operand" "vBm")] UNSPEC_FIX_NOTRUNC) (const_vector:V2SI [(const_int 0) (const_int 0)])))] - "TARGET_SSE2 && <mask_avx512vl_condition>" + "TARGET_SSE2" { if (TARGET_AVX) - return "vcvtpd2dq{x}\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"; + return "vcvtpd2dq{x}\t{%1, %0|%0, %1}"; else return "cvtpd2dq\t{%1, %0|%0, %1}"; } @@ -5399,6 +6082,38 @@ (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double")]) +(define_insn "sse2_cvtpd2dq_mask" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] + UNSPEC_FIX_NOTRUNC) + (vec_select:V2SI + (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvtpd2dq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_cvtpd2dq_mask_1" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] + UNSPEC_FIX_NOTRUNC) + (const_vector:V2SI [(const_int 0) (const_int 0)]) + (match_operand:QI 2 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvtpd2dq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + ;; For ufix_notrunc* insn patterns (define_mode_attr pd2udqsuff [(V8DF "") (V4DF "{y}")]) @@ -5414,15 +6129,49 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "ufix_notruncv2dfv2si2<mask_name>" +(define_insn "ufix_notruncv2dfv2si2" [(set (match_operand:V4SI 0 "register_operand" "=v") (vec_concat:V4SI (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] - UNSPEC_UNSIGNED_FIX_NOTRUNC) + UNSPEC_UNSIGNED_FIX_NOTRUNC) (const_vector:V2SI [(const_int 0) (const_int 0)])))] "TARGET_AVX512VL" - "vcvtpd2udq{x}\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvtpd2udq{x}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "ufix_notruncv2dfv2si2_mask" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (unspec:V2SI + [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] + UNSPEC_UNSIGNED_FIX_NOTRUNC) + (vec_select:V2SI + (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvtpd2udq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*ufix_notruncv2dfv2si2_mask_1" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (unspec:V2SI + [(match_operand:V2DF 1 "nonimmediate_operand" "vm")] + UNSPEC_UNSIGNED_FIX_NOTRUNC) + (const_vector:V2SI [(const_int 0) (const_int 0)]) + (match_operand:QI 2 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvtpd2udq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "TI")]) @@ -5437,13 +6186,43 @@ (set_attr "prefix" "evex") (set_attr "mode" "OI")]) -(define_insn "ufix_truncv2dfv2si2<mask_name>" +(define_insn "ufix_truncv2dfv2si2" [(set (match_operand:V4SI 0 "register_operand" "=v") (vec_concat:V4SI (unsigned_fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) (const_vector:V2SI [(const_int 0) (const_int 0)])))] "TARGET_AVX512VL" - "vcvttpd2udq{x}\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + "vcvttpd2udq{x}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "ufix_truncv2dfv2si2_mask" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (unsigned_fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) + (vec_select:V2SI + (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvttpd2udq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*ufix_truncv2dfv2si2_mask_1" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (unsigned_fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) + (const_vector:V2SI [(const_int 0) (const_int 0)]) + (match_operand:QI 2 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvttpd2udq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "TI")]) @@ -5588,15 +6367,15 @@ "TARGET_AVX" "operands[2] = CONST0_RTX (V4SImode);") -(define_insn "sse2_cvttpd2dq<mask_name>" +(define_insn "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "=v") (vec_concat:V4SI (fix:V2SI (match_operand:V2DF 1 "vector_operand" "vBm")) (const_vector:V2SI [(const_int 0) (const_int 0)])))] - "TARGET_SSE2 && <mask_avx512vl_condition>" + "TARGET_SSE2" { if (TARGET_AVX) - return "vcvttpd2dq{x}\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"; + return "vcvttpd2dq{x}\t{%1, %0|%0, %1}"; else return "cvttpd2dq\t{%1, %0|%0, %1}"; } @@ -5607,6 +6386,36 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "sse2_cvttpd2dq_mask" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) + (vec_select:V2SI + (match_operand:V4SI 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvttpd2dq{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +(define_insn "*sse2_cvttpd2dq_mask_1" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (vec_concat:V4SI + (vec_merge:V2SI + (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "vm")) + (const_vector:V2SI [(const_int 0) (const_int 0)]) + (match_operand:QI 2 "register_operand" "Yk")) + (const_vector:V2SI [(const_int 0) (const_int 0)])))] + "TARGET_AVX512VL" + "vcvttpd2dq{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + (define_insn "sse2_cvtsd2ss<round_name>" [(set (match_operand:V4SF 0 "register_operand" "=x,x,v") (vec_merge:V4SF @@ -5726,26 +6535,28 @@ (define_expand "sse2_cvtpd2ps_mask" [(set (match_operand:V4SF 0 "register_operand") - (vec_merge:V4SF - (vec_concat:V4SF + (vec_concat:V4SF + (vec_merge:V2SF (float_truncate:V2SF (match_operand:V2DF 1 "vector_operand")) - (match_dup 4)) - (match_operand:V4SF 2 "register_operand") - (match_operand:QI 3 "register_operand")))] + (vec_select:V2SF + (match_operand:V4SF 2 "nonimm_or_0_operand") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand")) + (match_dup 4)))] "TARGET_SSE2" "operands[4] = CONST0_RTX (V2SFmode);") -(define_insn "*sse2_cvtpd2ps<mask_name>" +(define_insn "*sse2_cvtpd2ps" [(set (match_operand:V4SF 0 "register_operand" "=v") (vec_concat:V4SF (float_truncate:V2SF (match_operand:V2DF 1 "vector_operand" "vBm")) - (match_operand:V2SF 2 "const0_operand")))] - "TARGET_SSE2 && <mask_avx512vl_condition>" + (match_operand:V2SF 2 "const0_operand" "C")))] + "TARGET_SSE2" { if (TARGET_AVX) - return "vcvtpd2ps{x}\t{%1, %0<mask_operand3>|%0<mask_operand3>, %1}"; + return "vcvtpd2ps{x}\t{%1, %0|%0, %1}"; else return "cvtpd2ps\t{%1, %0|%0, %1}"; } @@ -5757,6 +6568,38 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) +(define_insn "*sse2_cvtpd2ps_mask" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (vec_concat:V4SF + (vec_merge:V2SF + (float_truncate:V2SF + (match_operand:V2DF 1 "nonimmediate_operand" "vm")) + (vec_select:V2SF + (match_operand:V4SF 2 "nonimm_or_0_operand" "0C") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:QI 3 "register_operand" "Yk")) + (match_operand:V2SF 4 "const0_operand" "C")))] + "TARGET_AVX512VL" + "vcvtpd2ps{x}\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "V4SF")]) + +(define_insn "*sse2_cvtpd2ps_mask_1" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (vec_concat:V4SF + (vec_merge:V2SF + (float_truncate:V2SF + (match_operand:V2DF 1 "nonimmediate_operand" "vm")) + (match_operand:V2SF 3 "const0_operand" "C") + (match_operand:QI 2 "register_operand" "Yk")) + (match_operand:V2SF 4 "const0_operand" "C")))] + "TARGET_AVX512VL" + "vcvtpd2ps{x}\t{%1, %0%{%2%}%{z%}|%0%{%2%}%{z%}, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "evex") + (set_attr "mode" "V4SF")]) + ;; For <sse2_avx_avx512f>_cvtps2pd<avxsizesuffix> insn pattern (define_mode_attr sf2dfmode [(V8DF "V8SF") (V4DF "V4SF")]) @@ -5800,7 +6643,7 @@ (set_attr "mode" "V8DF")]) (define_insn "<avx512>_cvt<ssemodesuffix>2mask<mode>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI12_AVX512VL 1 "register_operand" "v")] UNSPEC_CVTINT2MASK))] @@ -5810,7 +6653,7 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_cvt<ssemodesuffix>2mask<mode>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_AVX512VL 1 "register_operand" "v")] UNSPEC_CVTINT2MASK))] @@ -5836,7 +6679,7 @@ (vec_merge:VI12_AVX512VL (match_operand:VI12_AVX512VL 2 "vector_all_ones_operand") (match_operand:VI12_AVX512VL 3 "const0_operand") - (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))] + (match_operand:<avx512fmaskmode> 1 "register_operand" "k")))] "TARGET_AVX512BW" "vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}" [(set_attr "prefix" "evex") @@ -5848,21 +6691,25 @@ (match_dup 2) (match_dup 3) (match_operand:<avx512fmaskmode> 1 "register_operand")))] - "TARGET_AVX512DQ" + "TARGET_AVX512F" "{ operands[2] = CONSTM1_RTX (<MODE>mode); operands[3] = CONST0_RTX (<MODE>mode); }") (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>" - [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VI48_AVX512VL (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand") (match_operand:VI48_AVX512VL 3 "const0_operand") - (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))] - "TARGET_AVX512DQ" - "vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}" - [(set_attr "prefix" "evex") + (match_operand:<avx512fmaskmode> 1 "register_operand" "k,Yk")))] + "TARGET_AVX512F" + "@ + vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1} + vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}" + [(set_attr "isa" "avx512dq,*") + (set_attr "length_immediate" "0,1") + (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) (define_insn "sse2_cvtps2pd<mask_name>" @@ -7190,6 +8037,10 @@ (const_string "mmxmov") ] (const_string "sselog"))) + (set (attr "mmx_isa") + (if_then_else (eq_attr "alternative" "7,8") + (const_string "native") + (const_string "*"))) (set (attr "prefix_data16") (if_then_else (eq_attr "alternative" "3,4") (const_string "1") @@ -7225,7 +8076,8 @@ movss\t{%1, %0|%0, %1} punpckldq\t{%2, %0|%0, %2} movd\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") + [(set_attr "mmx_isa" "*,*,native,native") + (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") (set_attr "mode" "V4SF,SF,DI,DI")]) (define_insn "*vec_concatv4sf" @@ -7244,6 +8096,17 @@ (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) +(define_insn "*vec_concatv4sf_0" + [(set (match_operand:V4SF 0 "register_operand" "=v") + (vec_concat:V4SF + (match_operand:V2SF 1 "nonimmediate_operand" "xm") + (match_operand:V2SF 2 "const0_operand" " C")))] + "TARGET_SSE2" + "%vmovq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "DF")]) + ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.c (define_insn "vec_set<mode>_0" @@ -7355,7 +8218,7 @@ [(set (match_operand:VI4F_256_512 0 "register_operand" "=v,v,v") (vec_merge:VI4F_256_512 (vec_duplicate:VI4F_256_512 - (match_operand:<ssescalarmode> 2 "general_operand" "v,m,r")) + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "v,m,r")) (match_operand:VI4F_256_512 1 "const0_operand" "C,C,C") (const_int 1)))] "TARGET_AVX" @@ -7420,6 +8283,25 @@ [(set (match_dup 0) (match_dup 1))] "operands[0] = adjust_address (operands[0], <ssescalarmode>mode, 0);") +;; Standard scalar operation patterns which preserve the rest of the +;; vector for combiner. +(define_insn "vec_setv2df_0" + [(set (match_operand:V2DF 0 "register_operand" "=x,v,x,v") + (vec_merge:V2DF + (vec_duplicate:V2DF + (match_operand:DF 2 "nonimmediate_operand" " x,v,m,m")) + (match_operand:V2DF 1 "register_operand" " 0,v,0,v") + (const_int 1)))] + "TARGET_SSE2" + "@ + movsd\t{%2, %0|%0, %2} + vmovsd\t{%2, %1, %0|%0, %1, %2} + movlpd\t{%2, %0|%0, %2} + vmovlpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "noavx,avx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + (define_expand "vec_set<mode>" [(match_operand:V 0 "register_operand") (match_operand:<ssescalarmode> 1 "register_operand") @@ -7837,13 +8719,16 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_mode_iterator VI48F_256_DQ + [V8SI V8SF (V4DI "TARGET_AVX512DQ") (V4DF "TARGET_AVX512DQ")]) + (define_expand "avx512vl_vextractf128<mode>" [(match_operand:<ssehalfvecmode> 0 "nonimmediate_operand") - (match_operand:VI48F_256 1 "register_operand") + (match_operand:VI48F_256_DQ 1 "register_operand") (match_operand:SI 2 "const_0_to_1_operand") (match_operand:<ssehalfvecmode> 3 "nonimm_or_0_operand") (match_operand:QI 4 "register_operand")] - "TARGET_AVX512DQ && TARGET_AVX512VL" + "TARGET_AVX512VL" { rtx (*insn)(rtx, rtx, rtx, rtx); rtx dest = operands[0]; @@ -7911,14 +8796,19 @@ (const_int 4) (const_int 5) (const_int 6) (const_int 7)])))] "TARGET_AVX512F - && <mask_mode512bit_condition> + && <mask_avx512dq_condition> && (<mask_applied> || !(MEM_P (operands[0]) && MEM_P (operands[1])))" { if (<mask_applied> || (!TARGET_AVX512VL && !REG_P (operands[0]) && EXT_REX_SSE_REG_P (operands[1]))) - return "vextract<shuffletype>32x8\t{$0x0, %1, %0<mask_operand2>|%0<mask_operand2>, %1, 0x0}"; + { + if (TARGET_AVX512DQ) + return "vextract<shuffletype>32x8\t{$0x0, %1, %0<mask_operand2>|%0<mask_operand2>, %1, 0x0}"; + else + return "vextract<shuffletype>64x4\t{$0x0, %1, %0|%0, %1, 0x0}"; + } else return "#"; } @@ -8028,7 +8918,7 @@ (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)])))] "TARGET_AVX - && <mask_avx512vl_condition> && <mask_avx512dq_condition> + && <mask_avx512vl_condition> && (<mask_applied> || !(MEM_P (operands[0]) && MEM_P (operands[1])))" { if (<mask_applied>) @@ -8346,9 +9236,9 @@ (define_expand "vec_extract<mode><ssehalfvecmodelower>" [(match_operand:<ssehalfvecmode> 0 "nonimmediate_operand") - (match_operand:V_512 1 "register_operand") + (match_operand:V_256_512 1 "register_operand") (match_operand 2 "const_0_to_1_operand")] - "TARGET_AVX512F" + "TARGET_AVX" { if (INTVAL (operands[2])) emit_insn (gen_vec_extract_hi_<mode> (operands[0], operands[1])); @@ -8876,7 +9766,7 @@ (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "0") (match_operand:VF_128 2 "register_operand" "v") - (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM) (match_dup 1) @@ -8893,7 +9783,7 @@ (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "0") (match_operand:VF_128 2 "register_operand" "v") - (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_FIXUPIMM) (match_dup 1) @@ -8917,18 +9807,33 @@ (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "avx512f_rndscale<mode><round_saeonly_name>" +(define_insn "avx512f_rndscale<mode><mask_scalar_name><round_saeonly_scalar_name>" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 - [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + [(match_operand:VF_128 2 "<round_saeonly_scalar_nimm_predicate>" "<round_saeonly_scalar_constraint>") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_ROUND) - (match_dup 1) - (const_int 1)))] - "TARGET_AVX512F" - "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %<iptr>2<round_saeonly_op4>, %3}" + (match_operand:VF_128 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512F" + "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %<iptr>2<round_saeonly_scalar_mask_op4>, %3}" + [(set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "*avx512f_rndscale<mode><round_saeonly_name>" + [(set (match_operand:VF_128 0 "register_operand" "=v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (unspec:<ssescalarmode> + [(match_operand:<ssescalarmode> 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>") + (match_operand:SI 3 "const_0_to_255_operand")] + UNSPEC_ROUND)) + (match_operand:VF_128 1 "register_operand" "v") + (const_int 1)))] + "TARGET_AVX512F" + "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}" [(set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) @@ -9301,7 +10206,7 @@ "!TARGET_SSE2 && TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "@ - movhps\t{%1, %0|%q0, %1} + movhps\t{%1, %0|%0, %1} movhlps\t{%1, %0|%0, %1} movlps\t{%H1, %0|%0, %H1}" [(set_attr "type" "ssemov") @@ -9553,7 +10458,7 @@ (define_insn "vec_concatv2df" [(set (match_operand:V2DF 0 "register_operand" "=x,x,v,x,v,x,x, v,x,x") (vec_concat:V2DF - (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,xm,0,0") + (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,vm,0,0") (match_operand:DF 2 "nonimm_or_0_operand" " x,x,v,1,1,m,m, C,x,m")))] "TARGET_SSE && (!(MEM_P (operands[1]) && MEM_P (operands[2])) @@ -9607,7 +10512,7 @@ [(set (match_operand:VF2_512_256 0 "register_operand" "=v") (vec_merge:VF2_512_256 (vec_duplicate:VF2_512_256 - (match_operand:<ssescalarmode> 2 "general_operand" "xm")) + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "xm")) (match_operand:VF2_512_256 1 "const0_operand" "C") (const_int 1)))] "TARGET_AVX" @@ -11123,7 +12028,7 @@ (const_int 1))))] "TARGET_SSE2" { - operands[3] = CONST1_RTX(<MODE>mode); + operands[3] = CONST1_RTX(<ssedoublemode>mode); ix86_fixup_binary_operands_no_copy (PLUS, <MODE>mode, operands); }) @@ -11267,12 +12172,25 @@ (set_attr "mode" "<sseinsnmode>")]) +(define_expand "vec_shl_<mode>" + [(set (match_dup 3) + (ashift:V1TI + (match_operand:V_128 1 "register_operand") + (match_operand:SI 2 "const_0_to_255_mul_8_operand"))) + (set (match_operand:V_128 0 "register_operand") (match_dup 4))] + "TARGET_SSE2" +{ + operands[1] = gen_lowpart (V1TImode, operands[1]); + operands[3] = gen_reg_rtx (V1TImode); + operands[4] = gen_lowpart (<MODE>mode, operands[3]); +}) + (define_expand "vec_shr_<mode>" [(set (match_dup 3) (lshiftrt:V1TI - (match_operand:VI_128 1 "register_operand") + (match_operand:V_128 1 "register_operand") (match_operand:SI 2 "const_0_to_255_mul_8_operand"))) - (set (match_operand:VI_128 0 "register_operand") (match_dup 4))] + (set (match_operand:V_128 0 "register_operand") (match_dup 4))] "TARGET_SSE2" { operands[1] = gen_lowpart (V1TImode, operands[1]); @@ -11642,7 +12560,7 @@ "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);") (define_insn "<avx512>_eq<mode>3<mask_scalar_merge_name>_1" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk,Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k,k") (unspec:<avx512fmaskmode> [(match_operand:VI12_AVX512VL 1 "nonimm_or_0_operand" "%v,v") (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "vm,C")] @@ -11657,7 +12575,7 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_eq<mode>3<mask_scalar_merge_name>_1" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk,Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k,k") (unspec:<avx512fmaskmode> [(match_operand:VI48_AVX512VL 1 "nonimm_or_0_operand" "%v,v") (match_operand:VI48_AVX512VL 2 "nonimm_or_0_operand" "vm,C")] @@ -11748,7 +12666,7 @@ (set_attr "mode" "OI")]) (define_insn "<avx512>_gt<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI48_AVX512VL 1 "register_operand" "v") (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))] @@ -11760,7 +12678,7 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_gt<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI12_AVX512VL 1 "register_operand" "v") (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))] @@ -11965,8 +12883,21 @@ (match_dup 2)))] "TARGET_SSE" { - operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode)); -}) + if (!TARGET_AVX512F) + operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode)); + else + operands[2] = CONSTM1_RTX (<MODE>mode); +}) + +(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>" + [(set (match_operand:VI 0 "register_operand" "=v") + (xor:VI (match_operand:VI 1 "nonimmediate_operand" "vm") + (match_operand:VI 2 "vector_all_ones_operand" "BC")))] + "TARGET_AVX512F" + "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}" + [(set_attr "type" "sselog") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) (define_expand "<sse2_avx2>_andnot<mode>3" [(set (match_operand:VI_AVX2 0 "register_operand") @@ -12004,7 +12935,7 @@ (match_operand:VI 2 "vector_operand" "xBm,xm,vm")))] "TARGET_SSE" { - static char buf[64]; + char buf[64]; const char *ops; const char *tmp; const char *ssesuffix; @@ -12074,7 +13005,8 @@ } snprintf (buf, sizeof (buf), ops, tmp, ssesuffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx") (set_attr "type" "sselog") @@ -12086,10 +13018,7 @@ (const_string "*"))) (set_attr "prefix" "orig,vex,evex") (set (attr "mode") - (cond [(and (match_test "<MODE_SIZE> == 16") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) - (const_string "<ssePSmode>") - (match_test "TARGET_AVX2") + (cond [(match_test "TARGET_AVX2") (const_string "<sseinsnmode>") (match_test "TARGET_AVX") (if_then_else @@ -12103,8 +13032,8 @@ (const_string "<sseinsnmode>")))]) (define_insn "*andnot<mode>3_bcst" - [(set (match_operand:VI 0 "register_operand" "=v") - (and:VI + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") + (and:VI48_AVX512VL (not:VI48_AVX512VL (match_operand:VI48_AVX512VL 1 "register_operand" "v")) (vec_duplicate:VI48_AVX512VL @@ -12149,7 +13078,7 @@ "TARGET_SSE && <mask_mode512bit_condition> && !(MEM_P (operands[1]) && MEM_P (operands[2]))" { - static char buf[64]; + char buf[64]; const char *ops; const char *tmp; const char *ssesuffix; @@ -12214,7 +13143,8 @@ } snprintf (buf, sizeof (buf), ops, tmp, ssesuffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx") (set_attr "type" "sselog") @@ -12226,10 +13156,7 @@ (const_string "*"))) (set_attr "prefix" "<mask_prefix3>,evex") (set (attr "mode") - (cond [(and (match_test "<MODE_SIZE> == 16") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) - (const_string "<ssePSmode>") - (match_test "TARGET_AVX2") + (cond [(match_test "TARGET_AVX2") (const_string "<sseinsnmode>") (match_test "TARGET_AVX") (if_then_else @@ -12249,7 +13176,7 @@ (match_operand:VI12_AVX_AVX512F 2 "vector_operand" "xBm,xm,vm")))] "TARGET_SSE && !(MEM_P (operands[1]) && MEM_P (operands[2]))" { - static char buf[64]; + char buf[64]; const char *ops; const char *tmp; const char *ssesuffix; @@ -12309,7 +13236,8 @@ } snprintf (buf, sizeof (buf), ops, tmp, ssesuffix); - return buf; + output_asm_insn (buf, operands); + return ""; } [(set_attr "isa" "noavx,avx,avx") (set_attr "type" "sselog") @@ -12321,10 +13249,7 @@ (const_string "*"))) (set_attr "prefix" "orig,vex,evex") (set (attr "mode") - (cond [(and (match_test "<MODE_SIZE> == 16") - (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")) - (const_string "<ssePSmode>") - (match_test "TARGET_AVX2") + (cond [(match_test "TARGET_AVX2") (const_string "<sseinsnmode>") (match_test "TARGET_AVX") (if_then_else @@ -12349,22 +13274,22 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "<avx512>_testm<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") - (unspec:<avx512fmaskmode> - [(match_operand:VI12_AVX512VL 1 "register_operand" "v") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")] - UNSPEC_TESTM))] - "TARGET_AVX512BW" - "vptestm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" - [(set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) +(define_mode_iterator VI1248_AVX512VLBW + [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL && TARGET_AVX512BW") + (V16QI "TARGET_AVX512VL && TARGET_AVX512BW") + (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX512VL && TARGET_AVX512BW") + (V8HI "TARGET_AVX512VL && TARGET_AVX512BW") + V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") + V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) + +(define_mode_iterator AVX512ZEXTMASK + [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI]) (define_insn "<avx512>_testm<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> - [(match_operand:VI48_AVX512VL 1 "register_operand" "v") - (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")] + [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v") + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")] UNSPEC_TESTM))] "TARGET_AVX512F" "vptestm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" @@ -12372,27 +13297,76 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<avx512>_testnm<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> - [(match_operand:VI12_AVX512VL 1 "register_operand" "v") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")] - UNSPEC_TESTNM))] - "TARGET_AVX512BW" - "vptestnm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" - [(set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "<avx512>_testnm<mode>3<mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") - (unspec:<avx512fmaskmode> - [(match_operand:VI48_AVX512VL 1 "register_operand" "v") - (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm")] + [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v") + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")] UNSPEC_TESTNM))] "TARGET_AVX512F" "vptestnm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" [(set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +(define_insn "*<avx512>_testm<VI1248_AVX512VLBW:mode>3_zext" + [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k") + (zero_extend:AVX512ZEXTMASK + (unspec:<VI1248_AVX512VLBW:avx512fmaskmode> + [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v") + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")] + UNSPEC_TESTM)))] + "TARGET_AVX512BW + && (<AVX512ZEXTMASK:MODE_SIZE> + > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))" + "vptestm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<VI1248_AVX512VLBW:sseinsnmode>")]) + +(define_insn "*<avx512>_testm<VI1248_AVX512VLBW:mode>3_zext_mask" + [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k") + (zero_extend:AVX512ZEXTMASK + (and:<VI1248_AVX512VLBW:avx512fmaskmode> + (unspec:<VI1248_AVX512VLBW:avx512fmaskmode> + [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v") + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")] + UNSPEC_TESTM) + (match_operand:<VI1248_AVX512VLBW:avx512fmaskmode> 3 "register_operand" "Yk"))))] + "TARGET_AVX512BW + && (<AVX512ZEXTMASK:MODE_SIZE> + > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))" + "vptestm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<VI1248_AVX512VLBW:sseinsnmode>")]) + +(define_insn "*<avx512>_testnm<VI1248_AVX512VLBW:mode>3_zext" + [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k") + (zero_extend:AVX512ZEXTMASK + (unspec:<VI1248_AVX512VLBW:avx512fmaskmode> + [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v") + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")] + UNSPEC_TESTNM)))] + "TARGET_AVX512BW + && (<AVX512ZEXTMASK:MODE_SIZE> + > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))" + "vptestnm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<VI1248_AVX512VLBW:sseinsnmode>")]) + +(define_insn "*<avx512>_testnm<VI1248_AVX512VLBW:mode>3_zext_mask" + [(set (match_operand:AVX512ZEXTMASK 0 "register_operand" "=k") + (zero_extend:AVX512ZEXTMASK + (and:<VI1248_AVX512VLBW:avx512fmaskmode> + (unspec:<VI1248_AVX512VLBW:avx512fmaskmode> + [(match_operand:VI1248_AVX512VLBW 1 "register_operand" "v") + (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand" "vm")] + UNSPEC_TESTNM) + (match_operand:<VI1248_AVX512VLBW:avx512fmaskmode> 3 "register_operand" "Yk"))))] + "TARGET_AVX512BW + && (<AVX512ZEXTMASK:MODE_SIZE> + > GET_MODE_SIZE (<VI1248_AVX512VLBW:avx512fmaskmode>mode))" + "vptestnm<VI1248_AVX512VLBW:ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}" + [(set_attr "prefix" "evex") + (set_attr "mode" "<VI1248_AVX512VLBW:sseinsnmode>")]) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel integral element swizzling @@ -12412,22 +13386,59 @@ }) (define_expand "vec_pack_trunc_qi" - [(set (match_operand:HI 0 ("register_operand")) - (ior:HI (ashift:HI (zero_extend:HI (match_operand:QI 2 ("register_operand"))) + [(set (match_operand:HI 0 "register_operand") + (ior:HI (ashift:HI (zero_extend:HI (match_operand:QI 2 "register_operand")) (const_int 8)) - (zero_extend:HI (match_operand:QI 1 ("register_operand")))))] + (zero_extend:HI (match_operand:QI 1 "register_operand"))))] "TARGET_AVX512F") (define_expand "vec_pack_trunc_<mode>" - [(set (match_operand:<DOUBLEMASKMODE> 0 ("register_operand")) - (ior:<DOUBLEMASKMODE> (ashift:<DOUBLEMASKMODE> (zero_extend:<DOUBLEMASKMODE> (match_operand:SWI24 2 ("register_operand"))) - (match_dup 3)) - (zero_extend:<DOUBLEMASKMODE> (match_operand:SWI24 1 ("register_operand")))))] + [(set (match_operand:<DOUBLEMASKMODE> 0 "register_operand") + (ior:<DOUBLEMASKMODE> + (ashift:<DOUBLEMASKMODE> + (zero_extend:<DOUBLEMASKMODE> + (match_operand:SWI24 2 "register_operand")) + (match_dup 3)) + (zero_extend:<DOUBLEMASKMODE> + (match_operand:SWI24 1 "register_operand"))))] "TARGET_AVX512BW" { operands[3] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)); }) +(define_expand "vec_pack_sbool_trunc_qi" + [(match_operand:QI 0 "register_operand") + (match_operand:QI 1 "register_operand") + (match_operand:QI 2 "register_operand") + (match_operand:QI 3 "const_int_operand")] + "TARGET_AVX512F" +{ + HOST_WIDE_INT nunits = INTVAL (operands[3]); + rtx mask, tem1, tem2; + if (nunits != 8 && nunits != 4) + FAIL; + mask = gen_reg_rtx (QImode); + emit_move_insn (mask, GEN_INT ((1 << (nunits / 2)) - 1)); + tem1 = gen_reg_rtx (QImode); + emit_insn (gen_kandqi (tem1, operands[1], mask)); + if (TARGET_AVX512DQ) + { + tem2 = gen_reg_rtx (QImode); + emit_insn (gen_kashiftqi (tem2, operands[2], + GEN_INT (nunits / 2))); + } + else + { + tem2 = gen_reg_rtx (HImode); + emit_insn (gen_kashifthi (tem2, lowpart_subreg (HImode, operands[2], + QImode), + GEN_INT (nunits / 2))); + tem2 = lowpart_subreg (QImode, tem2, HImode); + } + emit_insn (gen_kiorqi (operands[0], tem1, tem2)); + DONE; +}) + (define_insn "<sse2_avx2>_packsswb<mask_name>" [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x,v") (vec_concat:VI1_AVX512 @@ -13063,15 +14074,29 @@ switch (<MODE>mode) { case E_V8DFmode: - return "vmovapd\t{%2, %x0|%x0, %2}"; + if (misaligned_operand (operands[2], <ssequartermode>mode)) + return "vmovupd\t{%2, %x0|%x0, %2}"; + else + return "vmovapd\t{%2, %x0|%x0, %2}"; case E_V16SFmode: - return "vmovaps\t{%2, %x0|%x0, %2}"; + if (misaligned_operand (operands[2], <ssequartermode>mode)) + return "vmovups\t{%2, %x0|%x0, %2}"; + else + return "vmovaps\t{%2, %x0|%x0, %2}"; case E_V8DImode: - return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}" - : "vmovdqa\t{%2, %x0|%x0, %2}"; + if (misaligned_operand (operands[2], <ssequartermode>mode)) + return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}" + : "vmovdqu\t{%2, %x0|%x0, %2}"; + else + return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}" + : "vmovdqa\t{%2, %x0|%x0, %2}"; case E_V16SImode: - return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}" - : "vmovdqa\t{%2, %x0|%x0, %2}"; + if (misaligned_operand (operands[2], <ssequartermode>mode)) + return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}" + : "vmovdqu\t{%2, %x0|%x0, %2}"; + else + return which_alternative == 2 ? "vmovdqa32\t{%2, %x0|%x0, %2}" + : "vmovdqa\t{%2, %x0|%x0, %2}"; default: gcc_unreachable (); } @@ -13224,13 +14249,15 @@ (vec_concat:<ssedoublemode> (match_operand:VI8F_256 1 "register_operand" "v") (match_operand:VI8F_256 2 "nonimmediate_operand" "vm")) - (parallel [(match_operand 3 "const_0_to_3_operand") - (match_operand 4 "const_0_to_3_operand") - (match_operand 5 "const_4_to_7_operand") - (match_operand 6 "const_4_to_7_operand")])))] + (parallel [(match_operand 3 "const_0_to_3_operand") + (match_operand 4 "const_0_to_3_operand") + (match_operand 5 "const_4_to_7_operand") + (match_operand 6 "const_4_to_7_operand")])))] "TARGET_AVX512VL - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) - && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1))" + && (INTVAL (operands[3]) & 1) == 0 + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 + && (INTVAL (operands[5]) & 1) == 0 + && INTVAL (operands[5]) == INTVAL (operands[6]) - 1" { int mask; mask = INTVAL (operands[3]) / 2; @@ -13273,19 +14300,23 @@ (vec_concat:<ssedoublemode> (match_operand:V8FI 1 "register_operand" "v") (match_operand:V8FI 2 "nonimmediate_operand" "vm")) - (parallel [(match_operand 3 "const_0_to_7_operand") - (match_operand 4 "const_0_to_7_operand") - (match_operand 5 "const_0_to_7_operand") - (match_operand 6 "const_0_to_7_operand") - (match_operand 7 "const_8_to_15_operand") - (match_operand 8 "const_8_to_15_operand") - (match_operand 9 "const_8_to_15_operand") - (match_operand 10 "const_8_to_15_operand")])))] + (parallel [(match_operand 3 "const_0_to_7_operand") + (match_operand 4 "const_0_to_7_operand") + (match_operand 5 "const_0_to_7_operand") + (match_operand 6 "const_0_to_7_operand") + (match_operand 7 "const_8_to_15_operand") + (match_operand 8 "const_8_to_15_operand") + (match_operand 9 "const_8_to_15_operand") + (match_operand 10 "const_8_to_15_operand")])))] "TARGET_AVX512F - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) - && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1) - && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) - && INTVAL (operands[9]) == (INTVAL (operands[10]) - 1))" + && (INTVAL (operands[3]) & 1) == 0 + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 + && (INTVAL (operands[5]) & 1) == 0 + && INTVAL (operands[5]) == INTVAL (operands[6]) - 1 + && (INTVAL (operands[7]) & 1) == 0 + && INTVAL (operands[7]) == INTVAL (operands[8]) - 1 + && (INTVAL (operands[9]) & 1) == 0 + && INTVAL (operands[9]) == INTVAL (operands[10]) - 1" { int mask; mask = INTVAL (operands[3]) / 2; @@ -13331,21 +14362,23 @@ (vec_concat:<ssedoublemode> (match_operand:VI4F_256 1 "register_operand" "v") (match_operand:VI4F_256 2 "nonimmediate_operand" "vm")) - (parallel [(match_operand 3 "const_0_to_7_operand") - (match_operand 4 "const_0_to_7_operand") - (match_operand 5 "const_0_to_7_operand") - (match_operand 6 "const_0_to_7_operand") - (match_operand 7 "const_8_to_15_operand") - (match_operand 8 "const_8_to_15_operand") - (match_operand 9 "const_8_to_15_operand") + (parallel [(match_operand 3 "const_0_to_7_operand") + (match_operand 4 "const_0_to_7_operand") + (match_operand 5 "const_0_to_7_operand") + (match_operand 6 "const_0_to_7_operand") + (match_operand 7 "const_8_to_15_operand") + (match_operand 8 "const_8_to_15_operand") + (match_operand 9 "const_8_to_15_operand") (match_operand 10 "const_8_to_15_operand")])))] "TARGET_AVX512VL - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) - && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2) - && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3) - && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) - && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2) - && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3))" + && (INTVAL (operands[3]) & 3) == 0 + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 + && INTVAL (operands[3]) == INTVAL (operands[5]) - 2 + && INTVAL (operands[3]) == INTVAL (operands[6]) - 3 + && (INTVAL (operands[7]) & 3) == 0 + && INTVAL (operands[7]) == INTVAL (operands[8]) - 1 + && INTVAL (operands[7]) == INTVAL (operands[9]) - 2 + && INTVAL (operands[7]) == INTVAL (operands[10]) - 3" { int mask; mask = INTVAL (operands[3]) / 4; @@ -13397,35 +14430,39 @@ (vec_concat:<ssedoublemode> (match_operand:V16FI 1 "register_operand" "v") (match_operand:V16FI 2 "nonimmediate_operand" "vm")) - (parallel [(match_operand 3 "const_0_to_15_operand") - (match_operand 4 "const_0_to_15_operand") - (match_operand 5 "const_0_to_15_operand") - (match_operand 6 "const_0_to_15_operand") - (match_operand 7 "const_0_to_15_operand") - (match_operand 8 "const_0_to_15_operand") - (match_operand 9 "const_0_to_15_operand") - (match_operand 10 "const_0_to_15_operand") - (match_operand 11 "const_16_to_31_operand") - (match_operand 12 "const_16_to_31_operand") - (match_operand 13 "const_16_to_31_operand") - (match_operand 14 "const_16_to_31_operand") - (match_operand 15 "const_16_to_31_operand") - (match_operand 16 "const_16_to_31_operand") - (match_operand 17 "const_16_to_31_operand") - (match_operand 18 "const_16_to_31_operand")])))] + (parallel [(match_operand 3 "const_0_to_15_operand") + (match_operand 4 "const_0_to_15_operand") + (match_operand 5 "const_0_to_15_operand") + (match_operand 6 "const_0_to_15_operand") + (match_operand 7 "const_0_to_15_operand") + (match_operand 8 "const_0_to_15_operand") + (match_operand 9 "const_0_to_15_operand") + (match_operand 10 "const_0_to_15_operand") + (match_operand 11 "const_16_to_31_operand") + (match_operand 12 "const_16_to_31_operand") + (match_operand 13 "const_16_to_31_operand") + (match_operand 14 "const_16_to_31_operand") + (match_operand 15 "const_16_to_31_operand") + (match_operand 16 "const_16_to_31_operand") + (match_operand 17 "const_16_to_31_operand") + (match_operand 18 "const_16_to_31_operand")])))] "TARGET_AVX512F - && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1) - && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2) - && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3) - && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1) - && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2) - && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3) - && INTVAL (operands[11]) == (INTVAL (operands[12]) - 1) - && INTVAL (operands[11]) == (INTVAL (operands[13]) - 2) - && INTVAL (operands[11]) == (INTVAL (operands[14]) - 3) - && INTVAL (operands[15]) == (INTVAL (operands[16]) - 1) - && INTVAL (operands[15]) == (INTVAL (operands[17]) - 2) - && INTVAL (operands[15]) == (INTVAL (operands[18]) - 3))" + && (INTVAL (operands[3]) & 3) == 0 + && INTVAL (operands[3]) == INTVAL (operands[4]) - 1 + && INTVAL (operands[3]) == INTVAL (operands[5]) - 2 + && INTVAL (operands[3]) == INTVAL (operands[6]) - 3 + && (INTVAL (operands[7]) & 3) == 0 + && INTVAL (operands[7]) == INTVAL (operands[8]) - 1 + && INTVAL (operands[7]) == INTVAL (operands[9]) - 2 + && INTVAL (operands[7]) == INTVAL (operands[10]) - 3 + && (INTVAL (operands[11]) & 3) == 0 + && INTVAL (operands[11]) == INTVAL (operands[12]) - 1 + && INTVAL (operands[11]) == INTVAL (operands[13]) - 2 + && INTVAL (operands[11]) == INTVAL (operands[14]) - 3 + && (INTVAL (operands[15]) & 3) == 0 + && INTVAL (operands[15]) == INTVAL (operands[16]) - 1 + && INTVAL (operands[15]) == INTVAL (operands[17]) - 2 + && INTVAL (operands[15]) == INTVAL (operands[18]) - 3" { int mask; mask = INTVAL (operands[3]) / 4; @@ -14052,6 +15089,25 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "*vec_extractv16qi_zext" + [(set (match_operand:HI 0 "register_operand" "=r,r") + (zero_extend:HI + (vec_select:QI + (match_operand:V16QI 1 "register_operand" "x,v") + (parallel + [(match_operand:SI 2 "const_0_to_15_operand")]))))] + "TARGET_SSE4_1" + "@ + %vpextrb\t{%2, %1, %k0|%k0, %1, %2} + vpextrb\t{%2, %1, %k0|%k0, %1, %2}" + [(set_attr "isa" "*,avx512bw") + (set_attr "type" "sselog1") + (set_attr "prefix_data16" "1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + (define_insn "*vec_extract<mode>_mem" [(set (match_operand:<ssescalarmode> 0 "register_operand" "=r") (vec_select:<ssescalarmode> @@ -14076,13 +15132,37 @@ (symbol_ref "true")))]) (define_insn "*vec_extractv2di_0_sse" - [(set (match_operand:DI 0 "nonimmediate_operand" "=v,m") + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,x ,m") (vec_select:DI - (match_operand:V2DI 1 "nonimmediate_operand" "vm,v") + (match_operand:V2DI 1 "nonimmediate_operand" " x,xm,x") (parallel [(const_int 0)])))] "TARGET_SSE && !TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))" - "#") + "#" + [(set_attr "isa" "sse4,*,*") + (set (attr "preferred_for_speed") + (cond [(eq_attr "alternative" "0") + (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") + ] + (symbol_ref "true")))]) + +(define_split + [(set (match_operand:DI 0 "general_reg_operand") + (vec_select:DI + (match_operand:V2DI 1 "register_operand") + (parallel [(const_int 0)])))] + "TARGET_SSE4_1 && !TARGET_64BIT + && reload_completed" + [(set (match_dup 2) (match_dup 4)) + (set (match_dup 3) + (vec_select:SI + (match_dup 5) + (parallel [(const_int 1)])))] +{ + operands[4] = gen_lowpart (SImode, operands[1]); + operands[5] = gen_lowpart (V4SImode, operands[1]); + split_double_mode (DImode, &operands[0], 1, &operands[2], &operands[3]); +}) (define_split [(set (match_operand:SWI48x 0 "nonimmediate_operand") @@ -14389,6 +15469,10 @@ punpckldq\t{%2, %0|%0, %2} movd\t{%1, %0|%0, %1}" [(set_attr "isa" "noavx,noavx,avx,avx512dq,noavx,noavx,avx,*,*,*") + (set (attr "mmx_isa") + (if_then_else (eq_attr "alternative" "8,9") + (const_string "native") + (const_string "*"))) (set (attr "type") (cond [(eq_attr "alternative" "7") (const_string "ssemov") @@ -14413,22 +15497,22 @@ ;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE ;; alternatives pretty much forces the MMX alternative to be chosen. (define_insn "*vec_concatv2si" - [(set (match_operand:V2SI 0 "register_operand" "=x,x ,*y,x,x,*y,*y") + [(set (match_operand:V2SI 0 "register_operand" "=x,x ,x,x,*y,*y") (vec_concat:V2SI - (match_operand:SI 1 "nonimmediate_operand" " 0,rm,rm,0,m, 0,*rm") - (match_operand:SI 2 "reg_or_0_operand" " x,C ,C, x,C,*y,C")))] + (match_operand:SI 1 "nonimmediate_operand" " 0,rm,0,m, 0,rm") + (match_operand:SI 2 "reg_or_0_operand" " x,C ,x,C,*y,C")))] "TARGET_SSE && !TARGET_SSE4_1" "@ punpckldq\t{%2, %0|%0, %2} movd\t{%1, %0|%0, %1} - movd\t{%1, %0|%0, %1} unpcklps\t{%2, %0|%0, %2} movss\t{%1, %0|%0, %1} punpckldq\t{%2, %0|%0, %2} movd\t{%1, %0|%0, %1}" - [(set_attr "isa" "sse2,sse2,sse2,*,*,*,*") - (set_attr "type" "sselog,ssemov,mmxmov,sselog,ssemov,mmxcvt,mmxmov") - (set_attr "mode" "TI,TI,DI,V4SF,SF,DI,DI")]) + [(set_attr "isa" "sse2,sse2,*,*,*,*") + (set_attr "mmx_isa" "*,*,*,*,native,native") + (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov") + (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")]) (define_insn "*vec_concatv4si" [(set (match_operand:V4SI 0 "register_operand" "=x,v,x,x,v") @@ -14447,24 +15531,34 @@ (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) -;; movd instead of movq is required to handle broken assemblers. +(define_insn "*vec_concatv4si_0" + [(set (match_operand:V4SI 0 "register_operand" "=v,x") + (vec_concat:V4SI + (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y") + (match_operand:V2SI 2 "const0_operand" " C,C")))] + "TARGET_SSE2" + "@ + %vmovq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1}" + [(set_attr "mmx_isa" "*,native") + (set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex,orig") + (set_attr "mode" "TI")]) + (define_insn "vec_concatv2di" [(set (match_operand:V2DI 0 "register_operand" - "=Yr,*x,x ,v ,v,v ,x ,x,v ,x,x,v") + "=Yr,*x,x ,v ,x,v ,x,x,v") (vec_concat:V2DI - (match_operand:DI 1 "nonimmediate_operand" - " 0, 0,x ,Yv,r,vm,?!*y,0,Yv,0,0,v") - (match_operand:DI 2 "nonimm_or_0_operand" - " rm,rm,rm,rm,C ,C ,C ,x,Yv,x,m,m")))] + (match_operand:DI 1 "register_operand" + " 0, 0,x ,Yv,0,Yv,0,0,v") + (match_operand:DI 2 "nonimmediate_operand" + " rm,rm,rm,rm,x,Yv,x,m,m")))] "TARGET_SSE" "@ pinsrq\t{$1, %2, %0|%0, %2, 1} pinsrq\t{$1, %2, %0|%0, %2, 1} vpinsrq\t{$1, %2, %1, %0|%0, %1, %2, 1} vpinsrq\t{$1, %2, %1, %0|%0, %1, %2, 1} - * return HAVE_AS_IX86_INTERUNIT_MOVQ ? \"%vmovq\t{%1, %0|%0, %1}\" : \"%vmovd\t{%1, %0|%0, %1}\"; - %vmovq\t{%1, %0|%0, %1} - movq2dq\t{%1, %0|%0, %1} punpcklqdq\t{%2, %0|%0, %2} vpunpcklqdq\t{%2, %1, %0|%0, %1, %2} movlhps\t{%2, %0|%0, %2} @@ -14478,22 +15572,18 @@ (eq_attr "alternative" "3") (const_string "x64_avx512dq") (eq_attr "alternative" "4") - (const_string "x64_sse2") - (eq_attr "alternative" "5,6") - (const_string "sse2") - (eq_attr "alternative" "7") (const_string "sse2_noavx") - (eq_attr "alternative" "8,11") + (eq_attr "alternative" "5,8") (const_string "avx") ] (const_string "noavx"))) (set (attr "type") (if_then_else - (eq_attr "alternative" "0,1,2,3,7,8") + (eq_attr "alternative" "0,1,2,3,4,5") (const_string "sselog") (const_string "ssemov"))) (set (attr "prefix_rex") - (if_then_else (eq_attr "alternative" "0,1,2,3,4") + (if_then_else (eq_attr "alternative" "0,1,2,3") (const_string "1") (const_string "*"))) (set (attr "prefix_extra") @@ -14509,18 +15599,31 @@ (const_string "vex") (eq_attr "alternative" "3") (const_string "evex") - (eq_attr "alternative" "4,5") - (const_string "maybe_vex") - (eq_attr "alternative" "8,11") + (eq_attr "alternative" "5,8") (const_string "maybe_evex") ] (const_string "orig"))) - (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF") + (set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")]) + +(define_insn "*vec_concatv2di_0" + [(set (match_operand:V2DI 0 "register_operand" "=v,v ,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " r,vm,?!*y") + (match_operand:DI 2 "const0_operand" " C,C ,C")))] + "TARGET_SSE2" + "@ + * return HAVE_AS_IX86_INTERUNIT_MOVQ ? \"%vmovq\t{%1, %0|%0, %1}\" : \"%vmovd\t{%1, %0|%0, %1}\"; + %vmovq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1}" + [(set_attr "isa" "x64,*,*") + (set_attr "mmx_isa" "*,*,native") + (set_attr "type" "ssemov") + (set_attr "prefix_rex" "1,*,*") + (set_attr "prefix" "maybe_vex,maybe_vex,orig") + (set_attr "mode" "TI") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "4") + (cond [(eq_attr "alternative" "0") (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC") - (eq_attr "alternative" "6") - (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") ] (symbol_ref "true")))]) @@ -14529,7 +15632,7 @@ [(set (match_operand:VI8_AVX_AVX512F 0 "register_operand" "=v,v") (vec_merge:VI8_AVX_AVX512F (vec_duplicate:VI8_AVX_AVX512F - (match_operand:<ssescalarmode> 2 "general_operand" "r,vm")) + (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,vm")) (match_operand:VI8_AVX_AVX512F 1 "const0_operand" "C,C") (const_int 1)))] "TARGET_AVX" @@ -14563,6 +15666,18 @@ "TARGET_SSE2" "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;") +(define_expand "vec_unpacks_sbool_lo_qi" + [(match_operand:QI 0 "register_operand") + (match_operand:QI 1 "register_operand") + (match_operand:QI 2 "const_int_operand")] + "TARGET_AVX512F" +{ + if (INTVAL (operands[2]) != 8 && INTVAL (operands[2]) != 4) + FAIL; + emit_move_insn (operands[0], operands[1]); + DONE; +}) + (define_expand "vec_unpacks_lo_hi" [(set (subreg:HI (match_operand:QI 0 "register_operand") 0) (match_operand:HI 1 "register_operand"))] @@ -14584,6 +15699,29 @@ "TARGET_SSE2" "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;") +(define_expand "vec_unpacks_sbool_hi_qi" + [(match_operand:QI 0 "register_operand") + (match_operand:QI 1 "register_operand") + (match_operand:QI 2 "const_int_operand")] + "TARGET_AVX512F" +{ + HOST_WIDE_INT nunits = INTVAL (operands[2]); + if (nunits != 8 && nunits != 4) + FAIL; + if (TARGET_AVX512DQ) + emit_insn (gen_klshiftrtqi (operands[0], operands[1], + GEN_INT (nunits / 2))); + else + { + rtx tem = gen_reg_rtx (HImode); + emit_insn (gen_klshiftrthi (tem, lowpart_subreg (HImode, operands[1], + QImode), + GEN_INT (nunits / 2))); + emit_move_insn (operands[0], lowpart_subreg (QImode, tem, HImode)); + } + DONE; +}) + (define_expand "vec_unpacks_hi_hi" [(parallel [(set (subreg:HI (match_operand:QI 0 "register_operand") 0) @@ -14622,7 +15760,7 @@ (const_int 1))))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" { - operands[<mask_expand_op3>] = CONST1_RTX(<MODE>mode); + operands[<mask_expand_op3>] = CONST1_RTX(<ssedoublemode>mode); ix86_fixup_binary_operands_no_copy (PLUS, <MODE>mode, operands); }) @@ -14636,7 +15774,7 @@ (match_operand:VI12_AVX2 1 "vector_operand" "%0,v")) (zero_extend:<ssedoublemode> (match_operand:VI12_AVX2 2 "vector_operand" "xBm,vm"))) - (match_operand:VI12_AVX2 <mask_expand_op3> "const1_operand")) + (match_operand:<ssedoublemode> <mask_expand_op3> "const1_operand")) (const_int 1))))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition> && !(MEM_P (operands[1]) && MEM_P (operands[2]))" @@ -14679,9 +15817,9 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) -(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext" +(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext" [(set (match_operand:DI 0 "register_operand" "=r") - (zero_extend:DI + (any_extend:DI (unspec:SI [(match_operand:VF_128_256 1 "register_operand" "x")] UNSPEC_MOVMSK)))] @@ -14691,6 +15829,78 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "<MODE>")]) +(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_lt" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(lt:VF_128_256 + (match_operand:<sseintvecmode> 1 "register_operand" "x") + (match_operand:<sseintvecmode> 2 "const0_operand" "C"))] + UNSPEC_MOVMSK))] + "TARGET_SSE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))] + "operands[1] = gen_lowpart (<MODE>mode, operands[1]);" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "<MODE>")]) + +(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext_lt" + [(set (match_operand:DI 0 "register_operand" "=r") + (any_extend:DI + (unspec:SI + [(lt:VF_128_256 + (match_operand:<sseintvecmode> 1 "register_operand" "x") + (match_operand:<sseintvecmode> 2 "const0_operand" "C"))] + UNSPEC_MOVMSK)))] + "TARGET_64BIT && TARGET_SSE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))] + "operands[1] = gen_lowpart (<MODE>mode, operands[1]);" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "<MODE>")]) + +(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_shift" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(subreg:VF_128_256 + (ashiftrt:<sseintvecmode> + (match_operand:<sseintvecmode> 1 "register_operand" "x") + (match_operand:QI 2 "const_int_operand" "n")) 0)] + UNSPEC_MOVMSK))] + "TARGET_SSE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))] + "operands[1] = gen_lowpart (<MODE>mode, operands[1]);" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "<MODE>")]) + +(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext_shift" + [(set (match_operand:DI 0 "register_operand" "=r") + (any_extend:DI + (unspec:SI + [(subreg:VF_128_256 + (ashiftrt:<sseintvecmode> + (match_operand:<sseintvecmode> 1 "register_operand" "x") + (match_operand:QI 2 "const_int_operand" "n")) 0)] + UNSPEC_MOVMSK)))] + "TARGET_64BIT && TARGET_SSE" + "#" + "&& reload_completed" + [(set (match_dup 0) + (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))] + "operands[1] = gen_lowpart (<MODE>mode, operands[1]);" + [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "<MODE>")]) + (define_insn "<sse2_avx2>_pmovmskb" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI @@ -14724,6 +15934,88 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) +(define_insn "*sse2_pmovmskb_ext" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI + (unspec:SI + [(match_operand:V16QI 1 "register_operand" "x")] + UNSPEC_MOVMSK)))] + "TARGET_64BIT && TARGET_SSE2" + "%vpmovmskb\t{%1, %k0|%k0, %1}" + [(set_attr "type" "ssemov") + (set (attr "prefix_data16") + (if_then_else + (match_test "TARGET_AVX") + (const_string "*") + (const_string "1"))) + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*<sse2_avx2>_pmovmskb_lt" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x") + (match_operand:VI1_AVX2 2 "const0_operand" "C"))] + UNSPEC_MOVMSK))] + "TARGET_SSE2" + "#" + "" + [(set (match_dup 0) + (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))] + "" + [(set_attr "type" "ssemov") + (set (attr "prefix_data16") + (if_then_else + (match_test "TARGET_AVX") + (const_string "*") + (const_string "1"))) + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*<sse2_avx2>_pmovmskb_zext_lt" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (unspec:SI + [(lt:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand" "x") + (match_operand:VI1_AVX2 2 "const0_operand" "C"))] + UNSPEC_MOVMSK)))] + "TARGET_64BIT && TARGET_SSE2" + "#" + "" + [(set (match_dup 0) + (zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))] + "" + [(set_attr "type" "ssemov") + (set (attr "prefix_data16") + (if_then_else + (match_test "TARGET_AVX") + (const_string "*") + (const_string "1"))) + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + +(define_insn_and_split "*sse2_pmovmskb_ext_lt" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI + (unspec:SI + [(lt:V16QI (match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "const0_operand" "C"))] + UNSPEC_MOVMSK)))] + "TARGET_64BIT && TARGET_SSE2" + "#" + "" + [(set (match_dup 0) + (sign_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))] + "" + [(set_attr "type" "ssemov") + (set (attr "prefix_data16") + (if_then_else + (match_test "TARGET_AVX") + (const_string "*") + (const_string "1"))) + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "SI")]) + (define_expand "sse2_maskmovdqu" [(set (match_operand:V16QI 0 "memory_operand") (unspec:V16QI [(match_operand:V16QI 1 "register_operand") @@ -14800,7 +16092,7 @@ "mwait" [(set_attr "length" "3")]) -(define_insn "sse3_monitor_<mode>" +(define_insn "@sse3_monitor_<mode>" [(unspec_volatile [(match_operand:P 0 "register_operand" "a") (match_operand:SI 1 "register_operand" "c") (match_operand:SI 2 "register_operand" "d")] @@ -14946,13 +16238,13 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_insn "ssse3_ph<plusminus_mnemonic>wv4hi3" - [(set (match_operand:V4HI 0 "register_operand" "=y") +(define_insn_and_split "ssse3_ph<plusminus_mnemonic>wv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") (vec_concat:V4HI (vec_concat:V2HI (ssse3_plusminus:HI (vec_select:HI - (match_operand:V4HI 1 "register_operand" "0") + (match_operand:V4HI 1 "register_operand" "0,0,Yv") (parallel [(const_int 0)])) (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) (ssse3_plusminus:HI @@ -14961,19 +16253,38 @@ (vec_concat:V2HI (ssse3_plusminus:HI (vec_select:HI - (match_operand:V4HI 2 "nonimmediate_operand" "ym") + (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yv") (parallel [(const_int 0)])) (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) (ssse3_plusminus:HI (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))))] - "TARGET_SSSE3" - "ph<plusminus_mnemonic>w\t{%2, %0|%0, %2}" - [(set_attr "type" "sseiadd") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" + "@ + ph<plusminus_mnemonic>w\t{%2, %0|%0, %2} + # + #" + "TARGET_SSSE3 && reload_completed + && SSE_REGNO_P (REGNO (operands[0]))" + [(const_int 0)] +{ + /* Generate SSE version of the operation. */ + rtx op0 = lowpart_subreg (V8HImode, operands[0], + GET_MODE (operands[0])); + rtx op1 = lowpart_subreg (V8HImode, operands[1], + GET_MODE (operands[1])); + rtx op2 = lowpart_subreg (V8HImode, operands[2], + GET_MODE (operands[2])); + emit_insn (gen_ssse3_ph<plusminus_mnemonic>wv8hi3 (op0, op1, op2)); + ix86_move_vector_high_sse_to_mmx (op0); + DONE; +} + [(set_attr "mmx_isa" "native,sse_noavx,avx") + (set_attr "type" "sseiadd") (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI,TI")]) (define_insn "avx2_ph<plusminus_mnemonic>dv8si3" [(set (match_operand:V8SI 0 "register_operand" "=x") @@ -15052,26 +16363,45 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) -(define_insn "ssse3_ph<plusminus_mnemonic>dv2si3" - [(set (match_operand:V2SI 0 "register_operand" "=y") +(define_insn_and_split "ssse3_ph<plusminus_mnemonic>dv2si3" + [(set (match_operand:V2SI 0 "register_operand" "=y,x,Yv") (vec_concat:V2SI (plusminus:SI (vec_select:SI - (match_operand:V2SI 1 "register_operand" "0") + (match_operand:V2SI 1 "register_operand" "0,0,Yv") (parallel [(const_int 0)])) (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) (plusminus:SI (vec_select:SI - (match_operand:V2SI 2 "nonimmediate_operand" "ym") + (match_operand:V2SI 2 "register_mmxmem_operand" "ym,x,Yv") (parallel [(const_int 0)])) (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))))] - "TARGET_SSSE3" - "ph<plusminus_mnemonic>d\t{%2, %0|%0, %2}" - [(set_attr "type" "sseiadd") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" + "@ + ph<plusminus_mnemonic>d\t{%2, %0|%0, %2} + # + #" + "TARGET_SSSE3 && reload_completed + && SSE_REGNO_P (REGNO (operands[0]))" + [(const_int 0)] +{ + /* Generate SSE version of the operation. */ + rtx op0 = lowpart_subreg (V4SImode, operands[0], + GET_MODE (operands[0])); + rtx op1 = lowpart_subreg (V4SImode, operands[1], + GET_MODE (operands[1])); + rtx op2 = lowpart_subreg (V4SImode, operands[2], + GET_MODE (operands[2])); + emit_insn (gen_ssse3_ph<plusminus_mnemonic>dv4si3 (op0, op1, op2)); + ix86_move_vector_high_sse_to_mmx (op0); + DONE; +} + [(set_attr "mmx_isa" "native,sse_noavx,avx") + (set_attr "type" "sseiadd") (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI,TI")]) (define_insn "avx2_pmaddubsw256" [(set (match_operand:V16HI 0 "register_operand" "=x,v") @@ -15222,17 +16552,17 @@ (set_attr "mode" "TI")]) (define_insn "ssse3_pmaddubsw" - [(set (match_operand:V4HI 0 "register_operand" "=y") + [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") (ss_plus:V4HI (mult:V4HI (zero_extend:V4HI (vec_select:V4QI - (match_operand:V8QI 1 "register_operand" "0") + (match_operand:V8QI 1 "register_operand" "0,0,Yv") (parallel [(const_int 0) (const_int 2) (const_int 4) (const_int 6)]))) (sign_extend:V4HI (vec_select:V4QI - (match_operand:V8QI 2 "nonimmediate_operand" "ym") + (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv") (parallel [(const_int 0) (const_int 2) (const_int 4) (const_int 6)])))) (mult:V4HI @@ -15244,16 +16574,21 @@ (vec_select:V4QI (match_dup 2) (parallel [(const_int 1) (const_int 3) (const_int 5) (const_int 7)]))))))] - "TARGET_SSSE3" - "pmaddubsw\t{%2, %0|%0, %2}" - [(set_attr "type" "sseiadd") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" + "@ + pmaddubsw\t{%2, %0|%0, %2} + pmaddubsw\t{%2, %0|%0, %2} + vpmaddubsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "*,noavx,avx") + (set_attr "mmx_isa" "native,*,*") + (set_attr "type" "sseiadd") (set_attr "atom_unit" "simul") (set_attr "prefix_extra" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI,TI")]) (define_mode_iterator PMULHRSW - [V4HI V8HI (V16HI "TARGET_AVX2")]) + [V8HI (V16HI "TARGET_AVX2")]) (define_expand "<ssse3_avx2>_pmulhrsw<mode>3_mask" [(set (match_operand:PMULHRSW 0 "register_operand") @@ -15292,7 +16627,27 @@ (const_int 14)) (match_dup 3)) (const_int 1))))] - "TARGET_AVX2" + "TARGET_SSSE3" +{ + operands[3] = CONST1_RTX(<MODE>mode); + ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands); +}) + +(define_expand "smulhrs<mode>3" + [(set (match_operand:VI2_AVX2 0 "register_operand") + (truncate:VI2_AVX2 + (lshiftrt:<ssedoublemode> + (plus:<ssedoublemode> + (lshiftrt:<ssedoublemode> + (mult:<ssedoublemode> + (sign_extend:<ssedoublemode> + (match_operand:VI2_AVX2 1 "nonimmediate_operand")) + (sign_extend:<ssedoublemode> + (match_operand:VI2_AVX2 2 "nonimmediate_operand"))) + (const_int 14)) + (match_dup 3)) + (const_int 1))))] + "TARGET_SSSE3" { operands[3] = CONST1_RTX(<MODE>mode); ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands); @@ -15325,26 +16680,73 @@ (set_attr "prefix" "orig,maybe_evex,evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "*ssse3_pmulhrswv4hi3" - [(set (match_operand:V4HI 0 "register_operand" "=y") +(define_expand "smulhrsv4hi3" + [(set (match_operand:V4HI 0 "register_operand") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (lshiftrt:V4SI + (mult:V4SI + (sign_extend:V4SI + (match_operand:V4HI 1 "register_operand")) + (sign_extend:V4SI + (match_operand:V4HI 2 "register_operand"))) + (const_int 14)) + (match_dup 3)) + (const_int 1))))] + "TARGET_MMX_WITH_SSE && TARGET_SSSE3" +{ + operands[3] = CONST1_RTX(V4HImode); + ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands); +}) + +(define_expand "ssse3_pmulhrswv4hi3" + [(set (match_operand:V4HI 0 "register_operand") (truncate:V4HI (lshiftrt:V4SI (plus:V4SI (lshiftrt:V4SI (mult:V4SI (sign_extend:V4SI - (match_operand:V4HI 1 "nonimmediate_operand" "%0")) + (match_operand:V4HI 1 "register_mmxmem_operand")) (sign_extend:V4SI - (match_operand:V4HI 2 "nonimmediate_operand" "ym"))) + (match_operand:V4HI 2 "register_mmxmem_operand"))) + (const_int 14)) + (match_dup 3)) + (const_int 1))))] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" +{ + operands[3] = CONST1_RTX(V4HImode); + ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands); +}) + +(define_insn "*ssse3_pmulhrswv4hi3" + [(set (match_operand:V4HI 0 "register_operand" "=y,x,Yv") + (truncate:V4HI + (lshiftrt:V4SI + (plus:V4SI + (lshiftrt:V4SI + (mult:V4SI + (sign_extend:V4SI + (match_operand:V4HI 1 "register_mmxmem_operand" "%0,0,Yv")) + (sign_extend:V4SI + (match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yv"))) (const_int 14)) (match_operand:V4HI 3 "const1_operand")) (const_int 1))))] - "TARGET_SSSE3 && !(MEM_P (operands[1]) && MEM_P (operands[2]))" - "pmulhrsw\t{%2, %0|%0, %2}" - [(set_attr "type" "sseimul") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) + && TARGET_SSSE3 + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + pmulhrsw\t{%2, %0|%0, %2} + pmulhrsw\t{%2, %0|%0, %2} + vpmulhrsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "*,noavx,avx") + (set_attr "mmx_isa" "native,*,*") + (set_attr "type" "sseimul") (set_attr "prefix_extra" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI,TI")]) (define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>" [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x,v") @@ -15365,17 +16767,46 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "ssse3_pshufbv8qi3" - [(set (match_operand:V8QI 0 "register_operand" "=y") - (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0") - (match_operand:V8QI 2 "nonimmediate_operand" "ym")] - UNSPEC_PSHUFB))] - "TARGET_SSSE3" - "pshufb\t{%2, %0|%0, %2}"; - [(set_attr "type" "sselog1") +(define_insn_and_split "ssse3_pshufbv8qi3" + [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yv") + (unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0,0,Yv") + (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yv")] + UNSPEC_PSHUFB)) + (clobber (match_scratch:V4SI 3 "=X,x,Yv"))] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" + "@ + pshufb\t{%2, %0|%0, %2} + # + #" + "TARGET_SSSE3 && reload_completed + && SSE_REGNO_P (REGNO (operands[0]))" + [(set (match_dup 3) (match_dup 5)) + (set (match_dup 3) + (and:V4SI (match_dup 3) (match_dup 2))) + (set (match_dup 0) + (unspec:V16QI [(match_dup 1) (match_dup 4)] UNSPEC_PSHUFB))] +{ + /* Emulate MMX version of pshufb with SSE version by masking out the + bit 3 of the shuffle control byte. */ + operands[0] = lowpart_subreg (V16QImode, operands[0], + GET_MODE (operands[0])); + operands[1] = lowpart_subreg (V16QImode, operands[1], + GET_MODE (operands[1])); + operands[2] = lowpart_subreg (V4SImode, operands[2], + GET_MODE (operands[2])); + operands[4] = lowpart_subreg (V16QImode, operands[3], + GET_MODE (operands[3])); + rtvec par = gen_rtvec (4, GEN_INT (0xf7f7f7f7), + GEN_INT (0xf7f7f7f7), + GEN_INT (0xf7f7f7f7), + GEN_INT (0xf7f7f7f7)); + rtx vec_const = gen_rtx_CONST_VECTOR (V4SImode, par); + operands[5] = force_const_mem (V4SImode, vec_const); +} + [(set_attr "mmx_isa" "native,sse_noavx,avx") (set_attr "prefix_extra" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI,TI")]) (define_insn "<ssse3_avx2>_psign<mode>3" [(set (match_operand:VI124_AVX2 0 "register_operand" "=x,x") @@ -15395,17 +16826,22 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "ssse3_psign<mode>3" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y") + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,Yv") (unspec:MMXMODEI - [(match_operand:MMXMODEI 1 "register_operand" "0") - (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")] + [(match_operand:MMXMODEI 1 "register_operand" "0,0,Yv") + (match_operand:MMXMODEI 2 "register_mmxmem_operand" "ym,x,Yv")] UNSPEC_PSIGN))] - "TARGET_SSSE3" - "psign<mmxvecsize>\t{%2, %0|%0, %2}"; - [(set_attr "type" "sselog1") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" + "@ + psign<mmxvecsize>\t{%2, %0|%0, %2} + psign<mmxvecsize>\t{%2, %0|%0, %2} + vpsign<mmxvecsize>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "isa" "*,noavx,avx") + (set_attr "mmx_isa" "native,*,*") + (set_attr "type" "sselog1") (set_attr "prefix_extra" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI,TI")]) (define_insn "<ssse3_avx2>_palignr<mode>_mask" [(set (match_operand:VI1_AVX512 0 "register_operand" "=v") @@ -15460,23 +16896,59 @@ (set_attr "prefix" "orig,vex,evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "ssse3_palignrdi" - [(set (match_operand:DI 0 "register_operand" "=y") - (unspec:DI [(match_operand:DI 1 "register_operand" "0") - (match_operand:DI 2 "nonimmediate_operand" "ym") - (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] +(define_insn_and_split "ssse3_palignrdi" + [(set (match_operand:DI 0 "register_operand" "=y,x,Yv") + (unspec:DI [(match_operand:DI 1 "register_operand" "0,0,Yv") + (match_operand:DI 2 "register_mmxmem_operand" "ym,x,Yv") + (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n,n")] UNSPEC_PALIGNR))] - "TARGET_SSSE3" -{ - operands[3] = GEN_INT (INTVAL (operands[3]) / 8); - return "palignr\t{%3, %2, %0|%0, %2, %3}"; -} - [(set_attr "type" "sseishft") + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" +{ + switch (which_alternative) + { + case 0: + operands[3] = GEN_INT (INTVAL (operands[3]) / 8); + return "palignr\t{%3, %2, %0|%0, %2, %3}"; + case 1: + case 2: + return "#"; + default: + gcc_unreachable (); + } +} + "TARGET_SSSE3 && reload_completed + && SSE_REGNO_P (REGNO (operands[0]))" + [(set (match_dup 0) + (lshiftrt:V1TI (match_dup 0) (match_dup 3)))] +{ + /* Emulate MMX palignrdi with SSE psrldq. */ + rtx op0 = lowpart_subreg (V2DImode, operands[0], + GET_MODE (operands[0])); + if (TARGET_AVX) + emit_insn (gen_vec_concatv2di (op0, operands[2], operands[1])); + else + { + /* NB: SSE can only concatenate OP0 and OP1 to OP0. */ + emit_insn (gen_vec_concatv2di (op0, operands[1], operands[2])); + /* Swap bits 0:63 with bits 64:127. */ + rtx mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), + GEN_INT (3), + GEN_INT (0), + GEN_INT (1))); + rtx op1 = lowpart_subreg (V4SImode, op0, GET_MODE (op0)); + rtx op2 = gen_rtx_VEC_SELECT (V4SImode, op1, mask); + emit_insn (gen_rtx_SET (op1, op2)); + } + operands[0] = lowpart_subreg (V1TImode, op0, GET_MODE (op0)); +} + [(set_attr "mmx_isa" "native,sse_noavx,avx") + (set_attr "type" "sseishft") (set_attr "atom_unit" "sishuf") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI,TI")]) ;; Mode iterator to handle singularity w/ absence of V2DI and V4DI ;; modes for abs instruction on pre AVX-512 targets. @@ -15539,17 +17011,26 @@ } }) -(define_insn "abs<mode>2" - [(set (match_operand:MMXMODEI 0 "register_operand" "=y") +(define_insn "ssse3_abs<mode>2" + [(set (match_operand:MMXMODEI 0 "register_operand" "=y,Yv") (abs:MMXMODEI - (match_operand:MMXMODEI 1 "nonimmediate_operand" "ym")))] - "TARGET_SSSE3" - "pabs<mmxvecsize>\t{%1, %0|%0, %1}"; - [(set_attr "type" "sselog1") + (match_operand:MMXMODEI 1 "register_mmxmem_operand" "ym,Yv")))] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && TARGET_SSSE3" + "@ + pabs<mmxvecsize>\t{%1, %0|%0, %1} + %vpabs<mmxvecsize>\t{%1, %0|%0, %1}" + [(set_attr "mmx_isa" "native,*") + (set_attr "type" "sselog1") (set_attr "prefix_rep" "0") (set_attr "prefix_extra" "1") (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) - (set_attr "mode" "DI")]) + (set_attr "mode" "DI,TI")]) + +(define_insn "abs<mode>2" + [(set (match_operand:MMXMODEI 0 "register_operand") + (abs:MMXMODEI + (match_operand:MMXMODEI 1 "register_operand")))] + "TARGET_MMX_WITH_SSE && TARGET_SSSE3") ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -15680,6 +17161,108 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<MODE>")]) +;; Also define scalar versions. These are used for conditional move. +;; Using subregs into vector modes causes register allocation lossage. +;; These patterns do not allow memory operands because the native +;; instructions read the full 128-bits. + +(define_insn "sse4_1_blendv<ssemodesuffix>" + [(set (match_operand:MODEF 0 "register_operand" "=Yr,*x,x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "0,0,x") + (match_operand:MODEF 2 "register_operand" "Yr,*x,x") + (match_operand:MODEF 3 "register_operand" "Yz,Yz,x")] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" +{ + if (get_attr_mode (insn) == MODE_V4SF) + return (which_alternative == 2 + ? "vblendvps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + : "blendvps\t{%3, %2, %0|%0, %2, %3}"); + else + return (which_alternative == 2 + ? "vblendv<ssevecmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + : "blendv<ssevecmodesuffix>\t{%3, %2, %0|%0, %2, %3}"); +} + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set (attr "mode") + (cond [(match_test "TARGET_AVX") + (const_string "<ssevecmode>") + (match_test "optimize_function_for_size_p (cfun)") + (const_string "V4SF") + (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL") + (const_string "V4SF") + ] + (const_string "<ssevecmode>")))]) + +(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt" + [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") + (unspec:VF_128_256 + [(match_operand:VF_128_256 1 "register_operand" "0,0,x") + (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm") + (lt:VF_128_256 + (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x") + (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:VF_128_256 + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] + "operands[3] = gen_lowpart (<MODE>mode, operands[3]);" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<MODE>")]) + +(define_mode_attr ssefltmodesuffix + [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")]) + +(define_mode_attr ssefltvecmode + [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")]) + +(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint" + [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x") + (unspec:<ssebytemode> + [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x") + (match_operand:<ssebytemode> 2 "vector_operand" "YrBm,*xBm,xm") + (subreg:<ssebytemode> + (lt:VI48_AVX + (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x") + (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (unspec:<ssefltvecmode> + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] +{ + operands[0] = gen_lowpart (<ssefltvecmode>mode, operands[0]); + operands[1] = gen_lowpart (<ssefltvecmode>mode, operands[1]); + operands[2] = gen_lowpart (<ssefltvecmode>mode, operands[2]); + operands[3] = gen_lowpart (<ssefltvecmode>mode, operands[3]); +} + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<ssefltvecmode>")]) + (define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 @@ -15778,6 +17361,29 @@ (set_attr "btver2_decode" "vector,vector,vector") (set_attr "mode" "<sseinsnmode>")]) +(define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x") + (unspec:VI1_AVX2 + [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x") + (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm") + (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x") + (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))] + UNSPEC_BLENDV))] + "TARGET_SSE4_1" + "#" + "" + [(set (match_dup 0) + (unspec:VI1_AVX2 + [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))] + "" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "*,*,1") + (set_attr "prefix" "orig,orig,vex") + (set_attr "btver2_decode" "vector,vector,vector") + (set_attr "mode" "<sseinsnmode>")]) + (define_insn "sse4_1_pblendw" [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x") (vec_merge:V8HI @@ -15878,19 +17484,51 @@ [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v") (any_extend:V8HI (vec_select:V8QI - (match_operand:V16QI 1 "nonimmediate_operand" "Yrm,*xm,vm") + (match_operand:V16QI 1 "register_operand" "Yr,*x,v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3) (const_int 4) (const_int 5) (const_int 6) (const_int 7)]))))] "TARGET_SSE4_1 && <mask_avx512bw_condition> && <mask_avx512vl_condition>" - "%vpmov<extsuffix>bw\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + "%vpmov<extsuffix>bw\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_<code>v8qiv8hi2<mask_name>_1" + [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v") + (any_extend:V8HI + (match_operand:V8QI 1 "memory_operand" "m,m,m")))] + "TARGET_SSE4_1 && <mask_avx512bw_condition> && <mask_avx512vl_condition>" + "%vpmov<extsuffix>bw\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*sse4_1_<code>v8qiv8hi2<mask_name>_2" + [(set (match_operand:V8HI 0 "register_operand") + (any_extend:V8HI + (vec_select:V8QI + (subreg:V16QI + (vec_concat:V2DI + (match_operand:DI 1 "memory_operand") + (const_int 0)) 0) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_SSE4_1 && <mask_avx512bw_condition> && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V8HI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);") + (define_insn "<mask_codefor>avx512f_<code>v16qiv16si2<mask_name>" [(set (match_operand:V16SI 0 "register_operand" "=v") (any_extend:V16SI @@ -15905,33 +17543,98 @@ [(set (match_operand:V8SI 0 "register_operand" "=v") (any_extend:V8SI (vec_select:V8QI - (match_operand:V16QI 1 "nonimmediate_operand" "vm") + (match_operand:V16QI 1 "register_operand" "v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3) (const_int 4) (const_int 5) (const_int 6) (const_int 7)]))))] "TARGET_AVX2 && <mask_avx512vl_condition>" - "vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + "vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "OI")]) + +(define_insn "*avx2_<code>v8qiv8si2<mask_name>_1" + [(set (match_operand:V8SI 0 "register_operand" "=v") + (any_extend:V8SI + (match_operand:V8QI 1 "memory_operand" "m")))] + "TARGET_AVX2 && <mask_avx512vl_condition>" + "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) +(define_insn_and_split "*avx2_<code>v8qiv8si2<mask_name>_2" + [(set (match_operand:V8SI 0 "register_operand") + (any_extend:V8SI + (vec_select:V8QI + (subreg:V16QI + (vec_concat:V2DI + (match_operand:DI 1 "memory_operand") + (const_int 0)) 0) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_AVX2 && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V8SI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);") + (define_insn "sse4_1_<code>v4qiv4si2<mask_name>" [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") (any_extend:V4SI (vec_select:V4QI - (match_operand:V16QI 1 "nonimmediate_operand" "Yrm,*xm,vm") + (match_operand:V16QI 1 "register_operand" "Yr,*x,v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1 && <mask_avx512vl_condition>" - "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_<code>v4qiv4si2<mask_name>_1" + [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") + (any_extend:V4SI + (match_operand:V4QI 1 "memory_operand" "m,m,m")))] + "TARGET_SSE4_1 && <mask_avx512vl_condition>" + "%vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*sse4_1_<code>v4qiv4si2<mask_name>_2" + [(set (match_operand:V4SI 0 "register_operand") + (any_extend:V4SI + (vec_select:V4QI + (subreg:V16QI + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 1 "memory_operand")) + (const_vector:V4SI + [(const_int 0) (const_int 0) + (const_int 0) (const_int 0)]) + (const_int 1)) 0) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_SSE4_1 && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V4SI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V4QImode, 0);") + (define_insn "avx512f_<code>v16hiv16si2<mask_name>" [(set (match_operand:V16SI 0 "register_operand" "=v") (any_extend:V16SI @@ -15957,54 +17660,146 @@ [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") (any_extend:V4SI (vec_select:V4HI - (match_operand:V8HI 1 "nonimmediate_operand" "Yrm,*xm,vm") + (match_operand:V8HI 1 "register_operand" "Yr,*x,v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1 && <mask_avx512vl_condition>" - "%vpmov<extsuffix>wd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + "%vpmov<extsuffix>wd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_insn "*sse4_1_<code>v4hiv4si2<mask_name>_1" + [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") + (any_extend:V4SI + (match_operand:V4HI 1 "memory_operand" "m,m,m")))] + "TARGET_SSE4_1 && <mask_avx512vl_condition>" + "%vpmov<extsuffix>wd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +(define_insn_and_split "*sse4_1_<code>v4hiv4si2<mask_name>_2" + [(set (match_operand:V4SI 0 "register_operand") + (any_extend:V4SI + (vec_select:V4HI + (subreg:V8HI + (vec_concat:V2DI + (match_operand:DI 1 "memory_operand") + (const_int 0)) 0) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_SSE4_1 && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V4SI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V4HImode, 0);") + (define_insn "avx512f_<code>v8qiv8di2<mask_name>" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI (vec_select:V8QI - (match_operand:V16QI 1 "nonimmediate_operand" "vm") + (match_operand:V16QI 1 "register_operand" "v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3) (const_int 4) (const_int 5) (const_int 6) (const_int 7)]))))] "TARGET_AVX512F" - "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_insn "*avx512f_<code>v8qiv8di2<mask_name>_1" + [(set (match_operand:V8DI 0 "register_operand" "=v") + (any_extend:V8DI + (match_operand:V8QI 1 "memory_operand" "m")))] + "TARGET_AVX512F" + "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") (set_attr "mode" "XI")]) +(define_insn_and_split "*avx512f_<code>v8qiv8di2<mask_name>_2" + [(set (match_operand:V8DI 0 "register_operand") + (any_extend:V8DI + (vec_select:V8QI + (subreg:V16QI + (vec_concat:V2DI + (match_operand:DI 1 "memory_operand") + (const_int 0)) 0) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_AVX512F && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V8DI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);") + (define_insn "avx2_<code>v4qiv4di2<mask_name>" [(set (match_operand:V4DI 0 "register_operand" "=v") (any_extend:V4DI (vec_select:V4QI - (match_operand:V16QI 1 "nonimmediate_operand" "vm") + (match_operand:V16QI 1 "register_operand" "v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] "TARGET_AVX2 && <mask_avx512vl_condition>" - "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "OI")]) + +(define_insn "*avx2_<code>v4qiv4di2<mask_name>_1" + [(set (match_operand:V4DI 0 "register_operand" "=v") + (any_extend:V4DI + (match_operand:V4QI 1 "memory_operand" "m")))] + "TARGET_AVX2 && <mask_avx512vl_condition>" + "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) +(define_insn_and_split "*avx2_<code>v4qiv4di2<mask_name>_2" + [(set (match_operand:V4DI 0 "register_operand") + (any_extend:V4DI + (vec_select:V4QI + (subreg:V16QI + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 1 "memory_operand")) + (const_vector:V4SI + [(const_int 0) (const_int 0) + (const_int 0) (const_int 0)]) + (const_int 1)) 0) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_AVX2 && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V4DI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V4QImode, 0);") + (define_insn "sse4_1_<code>v2qiv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") (any_extend:V2DI (vec_select:V2QI - (match_operand:V16QI 1 "nonimmediate_operand" "Yrm,*xm,vm") + (match_operand:V16QI 1 "register_operand" "Yr,*x,v") (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1 && <mask_avx512vl_condition>" - "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %w1}" + "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") @@ -16025,30 +17820,92 @@ [(set (match_operand:V4DI 0 "register_operand" "=v") (any_extend:V4DI (vec_select:V4HI - (match_operand:V8HI 1 "nonimmediate_operand" "vm") + (match_operand:V8HI 1 "register_operand" "v") (parallel [(const_int 0) (const_int 1) (const_int 2) (const_int 3)]))))] "TARGET_AVX2 && <mask_avx512vl_condition>" - "vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + "vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "OI")]) + +(define_insn "*avx2_<code>v4hiv4di2<mask_name>_1" + [(set (match_operand:V4DI 0 "register_operand" "=v") + (any_extend:V4DI + (match_operand:V4HI 1 "memory_operand" "m")))] + "TARGET_AVX2 && <mask_avx512vl_condition>" + "vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_evex") (set_attr "mode" "OI")]) +(define_insn_and_split "*avx2_<code>v4hiv4di2<mask_name>_2" + [(set (match_operand:V4DI 0 "register_operand") + (any_extend:V4DI + (vec_select:V4HI + (subreg:V8HI + (vec_concat:V2DI + (match_operand:DI 1 "memory_operand") + (const_int 0)) 0) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] + "TARGET_AVX2 && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V4DI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V4HImode, 0);") + (define_insn "sse4_1_<code>v2hiv2di2<mask_name>" [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") (any_extend:V2DI (vec_select:V2HI - (match_operand:V8HI 1 "nonimmediate_operand" "Yrm,*xm,vm") + (match_operand:V8HI 1 "register_operand" "Yr,*x,v") (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1 && <mask_avx512vl_condition>" - "%vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}" + "%vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +(define_insn "*sse4_1_<code>v2hiv2di2<mask_name>_1" + [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") + (any_extend:V2DI + (match_operand:V2HI 1 "memory_operand" "m,m,m")))] + "TARGET_SSE4_1 && <mask_avx512vl_condition>" + "%vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_insn_and_split "*sse4_1_<code>v2hiv2di2<mask_name>_2" + [(set (match_operand:V2DI 0 "register_operand") + (any_extend:V2DI + (vec_select:V2HI + (subreg:V8HI + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 1 "memory_operand")) + (const_vector:V4SI + [(const_int 0) (const_int 0) + (const_int 0) (const_int 0)]) + (const_int 1)) 0) + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE4_1 && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V2DI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V2HImode, 0);") + (define_insn "avx512f_<code>v8siv8di2<mask_name>" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -16074,16 +17931,45 @@ [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") (any_extend:V2DI (vec_select:V2SI - (match_operand:V4SI 1 "nonimmediate_operand" "Yrm,*xm,vm") + (match_operand:V4SI 1 "register_operand" "Yr,*x,v") (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1 && <mask_avx512vl_condition>" - "%vpmov<extsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}" + "%vpmov<extsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,maybe_evex") (set_attr "mode" "TI")]) +(define_insn "*sse4_1_<code>v2siv2di2<mask_name>_1" + [(set (match_operand:V2DI 0 "register_operand" "=Yr,*x,v") + (any_extend:V2DI + (match_operand:V2SI 1 "memory_operand" "m,m,m")))] + "TARGET_SSE4_1 && <mask_avx512vl_condition>" + "%vpmov<extsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "isa" "noavx,noavx,avx") + (set_attr "type" "ssemov") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,maybe_evex") + (set_attr "mode" "TI")]) + +(define_insn_and_split "*sse4_1_<code>v2siv2di2<mask_name>_2" + [(set (match_operand:V2DI 0 "register_operand") + (any_extend:V2DI + (vec_select:V2SI + (subreg:V4SI + (vec_concat:V2DI + (match_operand:DI 1 "memory_operand") + (const_int 0)) 0) + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE4_1 && <mask_avx512vl_condition> + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (any_extend:V2DI (match_dup 1)))] + "operands[1] = adjust_address_nv (operands[1], V2SImode, 0);") + ;; ptestps/ptestpd are very similar to comiss and ucomiss when ;; setting FLAGS_REG. But it is not a really compare instruction. (define_insn "avx_vtest<ssemodesuffix><avxsizesuffix>" @@ -16131,6 +18017,24 @@ (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) +(define_expand "nearbyint<mode>2" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "vector_operand") + (match_dup 2)] + UNSPEC_ROUND))] + "TARGET_SSE4_1" + "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);") + +(define_expand "rint<mode>2" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "vector_operand") + (match_dup 2)] + UNSPEC_ROUND))] + "TARGET_SSE4_1" + "operands[2] = GEN_INT (ROUND_MXCSR);") + (define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>" [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x") (unspec:VF_128_256 @@ -16227,13 +18131,37 @@ [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v") (vec_merge:VF_128 (unspec:VF_128 - [(match_operand:VF_128 2 "register_operand" "Yr,*x,x,v") + [(match_operand:VF_128 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] UNSPEC_ROUND) (match_operand:VF_128 1 "register_operand" "0,0,x,v") (const_int 1)))] "TARGET_SSE4_1" "@ + round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %3} + round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %<iptr>2, %3} + vround<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %<iptr>2, %3} + vrndscale<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %<iptr>2, %3}" + [(set_attr "isa" "noavx,noavx,avx,avx512f") + (set_attr "type" "ssecvt") + (set_attr "length_immediate" "1") + (set_attr "prefix_data16" "1,1,*,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,orig,vex,evex") + (set_attr "mode" "<MODE>")]) + +(define_insn "*sse4_1_round<ssescalarmodesuffix>" + [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v") + (vec_merge:VF_128 + (vec_duplicate:VF_128 + (unspec:<ssescalarmode> + [(match_operand:<ssescalarmode> 2 "nonimmediate_operand" "Yrm,*xm,xm,vm") + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")] + UNSPEC_ROUND)) + (match_operand:VF_128 1 "register_operand" "0,0,x,v") + (const_int 1)))] + "TARGET_SSE4_1" + "@ round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3} round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3} vround<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3} @@ -16360,7 +18288,7 @@ (match_dup 6)] UNSPEC_PCMPESTR))] "TARGET_SSE4_2 - && can_create_pseudo_p ()" + && ix86_pre_reload_split ()" "#" "&& 1" [(const_int 0)] @@ -16496,7 +18424,7 @@ (match_dup 4)] UNSPEC_PCMPISTR))] "TARGET_SSE4_2 - && can_create_pseudo_p ()" + && ix86_pre_reload_split ()" "#" "&& 1" [(const_int 0)] @@ -16620,7 +18548,7 @@ operands[3]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512pf_gatherpf<mode>sf_mask" +(define_insn "*avx512pf_gatherpf<VI48_512:mode>sf_mask" [(unspec [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk") (match_operator:<GATHER_SCATTER_SF_MEM_MODE> 5 "vsib_mem_operator" @@ -16636,9 +18564,11 @@ switch (INTVAL (operands[4])) { case 3: - return "vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as + gas changed what it requires incompatibly. */ + return "%M2vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}"; case 2: - return "vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + return "%M2vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}"; default: gcc_unreachable (); } @@ -16665,7 +18595,7 @@ operands[3]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512pf_gatherpf<mode>df_mask" +(define_insn "*avx512pf_gatherpf<VI4_256_8_512:mode>df_mask" [(unspec [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk") (match_operator:V8DF 5 "vsib_mem_operator" @@ -16681,9 +18611,11 @@ switch (INTVAL (operands[4])) { case 3: - return "vgatherpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}"; + /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as + gas changed what it requires incompatibly. */ + return "%M2vgatherpf0<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}"; case 2: - return "vgatherpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}"; + return "%M2vgatherpf1<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}"; default: gcc_unreachable (); } @@ -16710,7 +18642,7 @@ operands[3]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512pf_scatterpf<mode>sf_mask" +(define_insn "*avx512pf_scatterpf<VI48_512:mode>sf_mask" [(unspec [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk") (match_operator:<GATHER_SCATTER_SF_MEM_MODE> 5 "vsib_mem_operator" @@ -16727,10 +18659,12 @@ { case 3: case 7: - return "vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as + gas changed what it requires incompatibly. */ + return "%M2vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}"; case 2: case 6: - return "vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}"; + return "%M2vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%X5%{%0%}}"; default: gcc_unreachable (); } @@ -16757,7 +18691,7 @@ operands[3]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512pf_scatterpf<mode>df_mask" +(define_insn "*avx512pf_scatterpf<VI4_256_8_512:mode>df_mask" [(unspec [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk") (match_operator:V8DF 5 "vsib_mem_operator" @@ -16774,10 +18708,12 @@ { case 3: case 7: - return "vscatterpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}"; + /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as + gas changed what it requires incompatibly. */ + return "%M2vscatterpf0<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}"; case 2: case 6: - return "vscatterpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}"; + return "%M2vscatterpf1<ssemodesuffix>pd\t{%5%{%0%}|%X5%{%0%}}"; default: gcc_unreachable (); } @@ -16812,7 +18748,7 @@ [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 - [(match_operand:VF_128 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] + [(match_operand:VF_128 1 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>")] UNSPEC_RCP28) (match_operand:VF_128 2 "register_operand" "v") (const_int 1)))] @@ -16838,7 +18774,7 @@ [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 - [(match_operand:VF_128 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")] + [(match_operand:VF_128 1 "<round_saeonly_nimm_scalar_predicate>" "<round_saeonly_constraint>")] UNSPEC_RSQRT28) (match_operand:VF_128 2 "register_operand" "v") (const_int 1)))] @@ -17886,10 +19822,17 @@ (set_attr "mode" "OI")]) ;; Clear the upper 128bits of AVX registers, equivalent to a NOP -;; if the upper 128bits are unused. -(define_insn "avx_vzeroupper" - [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)] - "TARGET_AVX" +;; if the upper 128bits are unused. Initially we expand the instructions +;; as though they had no effect on the SSE registers, but later add SETs and +;; CLOBBERs to the PARALLEL to model the real effect. +(define_expand "avx_vzeroupper" + [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] + "TARGET_AVX") + +(define_insn "*avx_vzeroupper" + [(match_parallel 0 "vzeroupper_pattern" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] + "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1" "vzeroupper" [(set_attr "type" "sse") (set_attr "modrm" "0") @@ -17898,6 +19841,44 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "OI")]) +(define_insn_and_split "*avx_vzeroupper_1" + [(match_parallel 0 "vzeroupper_pattern" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] + "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1" + "#" + "&& epilogue_completed" + [(match_dup 0)] +{ + /* For IPA-RA purposes, make it clear the instruction clobbers + even XMM registers not mentioned explicitly in the pattern. */ + unsigned int nregs = TARGET_64BIT ? 16 : 8; + unsigned int npats = XVECLEN (operands[0], 0); + rtvec vec = rtvec_alloc (nregs + 1); + RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0); + for (unsigned int i = 0, j = 1; i < nregs; ++i) + { + unsigned int regno = GET_SSE_REGNO (i); + if (j < npats + && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno) + { + RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j); + j++; + } + else + { + rtx reg = gen_rtx_REG (V2DImode, regno); + RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg); + } + } + operands[0] = gen_rtx_PARALLEL (VOIDmode, vec); +} + [(set_attr "type" "sse") + (set_attr "modrm" "0") + (set_attr "memory" "none") + (set_attr "prefix" "vex") + (set_attr "btver2_decode" "vector") + (set_attr "mode" "OI")]) + (define_mode_attr pbroadcast_evex_isa [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw") (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw") @@ -17972,6 +19953,165 @@ (set_attr "prefix" "<mask_prefix2>") (set_attr "mode" "<sseinsnmode>")]) +;; Recognize broadcast as a vec_select as produced by builtin_vec_perm. +;; If it so happens that the input is in memory, use vbroadcast. +;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128). +(define_insn "*avx_vperm_broadcast_v4sf" + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v") + (vec_select:V4SF + (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX" +{ + int elt = INTVAL (operands[3]); + switch (which_alternative) + { + case 0: + case 1: + operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4); + return "vbroadcastss\t{%1, %0|%0, %k1}"; + case 2: + operands[2] = GEN_INT (elt * 0x55); + return "vpermilps\t{%2, %1, %0|%0, %1, %2}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "ssemov,ssemov,sselog1") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "0,0,1") + (set_attr "prefix" "maybe_evex") + (set_attr "mode" "SF,SF,V4SF")]) + +(define_insn_and_split "*avx_vperm_broadcast_<mode>" + [(set (match_operand:VF_256 0 "register_operand" "=v,v,v") + (vec_select:VF_256 + (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v") + (match_parallel 2 "avx_vbroadcast_operand" + [(match_operand 3 "const_int_operand" "C,n,n")])))] + "TARGET_AVX + && (<MODE>mode != V4DFmode || !TARGET_AVX2 || operands[3] == const0_rtx)" + "#" + "&& reload_completed" + [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))] +{ + rtx op0 = operands[0], op1 = operands[1]; + int elt = INTVAL (operands[3]); + + if (REG_P (op1)) + { + int mask; + + if (TARGET_AVX2 && elt == 0) + { + emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode, + op1))); + DONE; + } + + /* Shuffle element we care about into all elements of the 128-bit lane. + The other lane gets shuffled too, but we don't care. */ + if (<MODE>mode == V4DFmode) + mask = (elt & 1 ? 15 : 0); + else + mask = (elt & 3) * 0x55; + emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask))); + + /* Shuffle the lane we care about into both lanes of the dest. */ + mask = (elt / (<ssescalarnum> / 2)) * 0x11; + if (EXT_REX_SSE_REG_P (op0)) + { + /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS + or VSHUFF128. */ + gcc_assert (<MODE>mode == V8SFmode); + if ((mask & 1) == 0) + emit_insn (gen_avx2_vec_dupv8sf (op0, + gen_lowpart (V4SFmode, op0))); + else + emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0, + GEN_INT (4), GEN_INT (5), + GEN_INT (6), GEN_INT (7), + GEN_INT (12), GEN_INT (13), + GEN_INT (14), GEN_INT (15))); + DONE; + } + + emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask))); + DONE; + } + + operands[1] = adjust_address (op1, <ssescalarmode>mode, + elt * GET_MODE_SIZE (<ssescalarmode>mode)); +}) + +(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>" + [(set (match_operand:VF2 0 "register_operand") + (vec_select:VF2 + (match_operand:VF2 1 "nonimmediate_operand") + (match_operand:SI 2 "const_0_to_255_operand")))] + "TARGET_AVX && <mask_mode512bit_condition>" +{ + int mask = INTVAL (operands[2]); + rtx perm[<ssescalarnum>]; + + int i; + for (i = 0; i < <ssescalarnum>; i = i + 2) + { + perm[i] = GEN_INT (((mask >> i) & 1) + i); + perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i); + } + + operands[2] + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm)); +}) + +(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>" + [(set (match_operand:VF1 0 "register_operand") + (vec_select:VF1 + (match_operand:VF1 1 "nonimmediate_operand") + (match_operand:SI 2 "const_0_to_255_operand")))] + "TARGET_AVX && <mask_mode512bit_condition>" +{ + int mask = INTVAL (operands[2]); + rtx perm[<ssescalarnum>]; + + int i; + for (i = 0; i < <ssescalarnum>; i = i + 4) + { + perm[i] = GEN_INT (((mask >> 0) & 3) + i); + perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i); + perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i); + perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i); + } + + operands[2] + = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm)); +}) + +;; This pattern needs to come before the avx2_perm*/avx512f_perm* +;; patterns, as they have the same RTL representation (vpermilp* +;; being a subset of what vpermp* can do), but vpermilp* has shorter +;; latency as it never crosses lanes. +(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>" + [(set (match_operand:VF 0 "register_operand" "=v") + (vec_select:VF + (match_operand:VF 1 "nonimmediate_operand" "vm") + (match_parallel 2 "" + [(match_operand 3 "const_int_operand")])))] + "TARGET_AVX && <mask_mode512bit_condition> + && avx_vpermilp_parallel (operands[2], <MODE>mode)" +{ + int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1; + operands[2] = GEN_INT (mask); + return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "<mask_prefix>") + (set_attr "mode" "<sseinsnmode>")]) + (define_expand "avx2_perm<mode>" [(match_operand:VI8F_256 0 "register_operand") (match_operand:VI8F_256 1 "nonimmediate_operand") @@ -18455,7 +20595,7 @@ [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v") (vec_duplicate:VI8_AVX512VL (zero_extend:DI - (match_operand:QI 1 "register_operand" "Yk"))))] + (match_operand:QI 1 "register_operand" "k"))))] "TARGET_AVX512CD" "vpbroadcastmb2q\t{%1, %0|%0, %1}" [(set_attr "type" "mskmov") @@ -18466,167 +20606,13 @@ [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v") (vec_duplicate:VI4_AVX512VL (zero_extend:SI - (match_operand:HI 1 "register_operand" "Yk"))))] + (match_operand:HI 1 "register_operand" "k"))))] "TARGET_AVX512CD" "vpbroadcastmw2d\t{%1, %0|%0, %1}" [(set_attr "type" "mskmov") (set_attr "prefix" "evex") (set_attr "mode" "XI")]) -;; Recognize broadcast as a vec_select as produced by builtin_vec_perm. -;; If it so happens that the input is in memory, use vbroadcast. -;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128). -(define_insn "*avx_vperm_broadcast_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v,v,v") - (vec_select:V4SF - (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v") - (match_parallel 2 "avx_vbroadcast_operand" - [(match_operand 3 "const_int_operand" "C,n,n")])))] - "TARGET_AVX" -{ - int elt = INTVAL (operands[3]); - switch (which_alternative) - { - case 0: - case 1: - operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4); - return "vbroadcastss\t{%1, %0|%0, %k1}"; - case 2: - operands[2] = GEN_INT (elt * 0x55); - return "vpermilps\t{%2, %1, %0|%0, %1, %2}"; - default: - gcc_unreachable (); - } -} - [(set_attr "type" "ssemov,ssemov,sselog1") - (set_attr "prefix_extra" "1") - (set_attr "length_immediate" "0,0,1") - (set_attr "prefix" "maybe_evex") - (set_attr "mode" "SF,SF,V4SF")]) - -(define_insn_and_split "*avx_vperm_broadcast_<mode>" - [(set (match_operand:VF_256 0 "register_operand" "=v,v,v") - (vec_select:VF_256 - (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v") - (match_parallel 2 "avx_vbroadcast_operand" - [(match_operand 3 "const_int_operand" "C,n,n")])))] - "TARGET_AVX" - "#" - "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)" - [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))] -{ - rtx op0 = operands[0], op1 = operands[1]; - int elt = INTVAL (operands[3]); - - if (REG_P (op1)) - { - int mask; - - if (TARGET_AVX2 && elt == 0) - { - emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode, - op1))); - DONE; - } - - /* Shuffle element we care about into all elements of the 128-bit lane. - The other lane gets shuffled too, but we don't care. */ - if (<MODE>mode == V4DFmode) - mask = (elt & 1 ? 15 : 0); - else - mask = (elt & 3) * 0x55; - emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask))); - - /* Shuffle the lane we care about into both lanes of the dest. */ - mask = (elt / (<ssescalarnum> / 2)) * 0x11; - if (EXT_REX_SSE_REG_P (op0)) - { - /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS - or VSHUFF128. */ - gcc_assert (<MODE>mode == V8SFmode); - if ((mask & 1) == 0) - emit_insn (gen_avx2_vec_dupv8sf (op0, - gen_lowpart (V4SFmode, op0))); - else - emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0, - GEN_INT (4), GEN_INT (5), - GEN_INT (6), GEN_INT (7), - GEN_INT (12), GEN_INT (13), - GEN_INT (14), GEN_INT (15))); - DONE; - } - - emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask))); - DONE; - } - - operands[1] = adjust_address (op1, <ssescalarmode>mode, - elt * GET_MODE_SIZE (<ssescalarmode>mode)); -}) - -(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>" - [(set (match_operand:VF2 0 "register_operand") - (vec_select:VF2 - (match_operand:VF2 1 "nonimmediate_operand") - (match_operand:SI 2 "const_0_to_255_operand")))] - "TARGET_AVX && <mask_mode512bit_condition>" -{ - int mask = INTVAL (operands[2]); - rtx perm[<ssescalarnum>]; - - int i; - for (i = 0; i < <ssescalarnum>; i = i + 2) - { - perm[i] = GEN_INT (((mask >> i) & 1) + i); - perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i); - } - - operands[2] - = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm)); -}) - -(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>" - [(set (match_operand:VF1 0 "register_operand") - (vec_select:VF1 - (match_operand:VF1 1 "nonimmediate_operand") - (match_operand:SI 2 "const_0_to_255_operand")))] - "TARGET_AVX && <mask_mode512bit_condition>" -{ - int mask = INTVAL (operands[2]); - rtx perm[<ssescalarnum>]; - - int i; - for (i = 0; i < <ssescalarnum>; i = i + 4) - { - perm[i] = GEN_INT (((mask >> 0) & 3) + i); - perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i); - perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i); - perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i); - } - - operands[2] - = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm)); -}) - -(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>" - [(set (match_operand:VF 0 "register_operand" "=v") - (vec_select:VF - (match_operand:VF 1 "nonimmediate_operand" "vm") - (match_parallel 2 "" - [(match_operand 3 "const_int_operand")])))] - "TARGET_AVX && <mask_mode512bit_condition> - && avx_vpermilp_parallel (operands[2], <MODE>mode)" -{ - int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1; - operands[2] = GEN_INT (mask); - return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}"; -} - [(set_attr "type" "sselog") - (set_attr "prefix_extra" "1") - (set_attr "length_immediate" "1") - (set_attr "prefix" "<mask_prefix>") - (set_attr "mode" "<sseinsnmode>")]) - (define_insn "<sse2_avx_avx512f>_vpermilvar<mode>3<mask_name>" [(set (match_operand:VF 0 "register_operand" "=v") (unspec:VF @@ -19185,9 +21171,9 @@ (define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>" [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m") - (unspec:AVX256MODE2P - [(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "xm,x")] - UNSPEC_CAST))] + (vec_concat:AVX256MODE2P + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "xm,x") + (unspec:<ssehalfvecmode> [(const_int 0)] UNSPEC_CAST)))] "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "#" "&& reload_completed" @@ -19286,9 +21272,11 @@ (define_insn "avx_vec_concat<mode>" [(set (match_operand:V_256_512 0 "register_operand" "=x,v,x,Yv") (vec_concat:V_256_512 - (match_operand:<ssehalfvecmode> 1 "register_operand" "x,v,x,v") + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "x,v,xm,vm") (match_operand:<ssehalfvecmode> 2 "nonimm_or_0_operand" "xm,vm,C,C")))] - "TARGET_AVX" + "TARGET_AVX + && (operands[2] == CONST0_RTX (<ssehalfvecmode>mode) + || !MEM_P (operands[1]))" { switch (which_alternative) { @@ -19314,27 +21302,63 @@ switch (get_attr_mode (insn)) { case MODE_V16SF: - return "vmovaps\t{%1, %t0|%t0, %1}"; + if (misaligned_operand (operands[1], <ssehalfvecmode>mode)) + return "vmovups\t{%1, %t0|%t0, %1}"; + else + return "vmovaps\t{%1, %t0|%t0, %1}"; case MODE_V8DF: - return "vmovapd\t{%1, %t0|%t0, %1}"; + if (misaligned_operand (operands[1], <ssehalfvecmode>mode)) + return "vmovupd\t{%1, %t0|%t0, %1}"; + else + return "vmovapd\t{%1, %t0|%t0, %1}"; case MODE_V8SF: - return "vmovaps\t{%1, %x0|%x0, %1}"; + if (misaligned_operand (operands[1], <ssehalfvecmode>mode)) + return "vmovups\t{%1, %x0|%x0, %1}"; + else + return "vmovaps\t{%1, %x0|%x0, %1}"; case MODE_V4DF: - return "vmovapd\t{%1, %x0|%x0, %1}"; + if (misaligned_operand (operands[1], <ssehalfvecmode>mode)) + return "vmovupd\t{%1, %x0|%x0, %1}"; + else + return "vmovapd\t{%1, %x0|%x0, %1}"; case MODE_XI: - if (which_alternative == 2) - return "vmovdqa\t{%1, %t0|%t0, %1}"; - else if (GET_MODE_SIZE (<ssescalarmode>mode) == 8) - return "vmovdqa64\t{%1, %t0|%t0, %1}"; + if (misaligned_operand (operands[1], <ssehalfvecmode>mode)) + { + if (which_alternative == 2) + return "vmovdqu\t{%1, %t0|%t0, %1}"; + else if (GET_MODE_SIZE (<ssescalarmode>mode) == 8) + return "vmovdqu64\t{%1, %t0|%t0, %1}"; + else + return "vmovdqu32\t{%1, %t0|%t0, %1}"; + } else - return "vmovdqa32\t{%1, %t0|%t0, %1}"; + { + if (which_alternative == 2) + return "vmovdqa\t{%1, %t0|%t0, %1}"; + else if (GET_MODE_SIZE (<ssescalarmode>mode) == 8) + return "vmovdqa64\t{%1, %t0|%t0, %1}"; + else + return "vmovdqa32\t{%1, %t0|%t0, %1}"; + } case MODE_OI: - if (which_alternative == 2) - return "vmovdqa\t{%1, %x0|%x0, %1}"; - else if (GET_MODE_SIZE (<ssescalarmode>mode) == 8) - return "vmovdqa64\t{%1, %x0|%x0, %1}"; + if (misaligned_operand (operands[1], <ssehalfvecmode>mode)) + { + if (which_alternative == 2) + return "vmovdqu\t{%1, %x0|%x0, %1}"; + else if (GET_MODE_SIZE (<ssescalarmode>mode) == 8) + return "vmovdqu64\t{%1, %x0|%x0, %1}"; + else + return "vmovdqu32\t{%1, %x0|%x0, %1}"; + } else - return "vmovdqa32\t{%1, %x0|%x0, %1}"; + { + if (which_alternative == 2) + return "vmovdqa\t{%1, %x0|%x0, %1}"; + else if (GET_MODE_SIZE (<ssescalarmode>mode) == 8) + return "vmovdqa64\t{%1, %x0|%x0, %1}"; + else + return "vmovdqa32\t{%1, %x0|%x0, %1}"; + } default: gcc_unreachable (); } @@ -19506,7 +21530,7 @@ operands[5]), UNSPEC_VSIBADDR); }) -(define_insn "*avx2_gathersi<mode>" +(define_insn "*avx2_gathersi<VEC_GATHER_MODE:mode>" [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") (unspec:VEC_GATHER_MODE [(match_operand:VEC_GATHER_MODE 2 "register_operand" "0") @@ -19521,12 +21545,12 @@ UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" - "v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*avx2_gathersi<mode>_2" + "%M3v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx2_gathersi<VEC_GATHER_MODE:mode>_2" [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") (unspec:VEC_GATHER_MODE [(pc) @@ -19541,7 +21565,7 @@ UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" - "v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %6, %0|%0, %6, %1}" + "%M2v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %6, %0|%0, %6, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) @@ -19567,7 +21591,7 @@ operands[5]), UNSPEC_VSIBADDR); }) -(define_insn "*avx2_gatherdi<mode>" +(define_insn "*avx2_gatherdi<VEC_GATHER_MODE:mode>" [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") (unspec:VEC_GATHER_MODE [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0") @@ -19582,12 +21606,12 @@ UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" - "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*avx2_gatherdi<mode>_2" + "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx2_gatherdi<VEC_GATHER_MODE:mode>_2" [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") (unspec:VEC_GATHER_MODE [(pc) @@ -19603,15 +21627,15 @@ (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" { - if (<MODE>mode != <VEC_GATHER_SRCDI>mode) - return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %x0|%x0, %6, %4}"; - return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}"; -} - [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*avx2_gatherdi<mode>_3" + if (<VEC_GATHER_MODE:MODE>mode != <VEC_GATHER_SRCDI>mode) + return "%M2v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %x0|%x0, %6, %4}"; + return "%M2v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}"; +} + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx2_gatherdi<VI4F_256:mode>_3" [(set (match_operand:<VEC_GATHER_SRCDI> 0 "register_operand" "=&x") (vec_select:<VEC_GATHER_SRCDI> (unspec:VI4F_256 @@ -19629,12 +21653,12 @@ (const_int 2) (const_int 3)]))) (clobber (match_scratch:VI4F_256 1 "=&x"))] "TARGET_AVX2" - "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %0|%0, %7, %5}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*avx2_gatherdi<mode>_4" + "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %0|%0, %7, %5}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx2_gatherdi<VI4F_256:mode>_4" [(set (match_operand:<VEC_GATHER_SRCDI> 0 "register_operand" "=&x") (vec_select:<VEC_GATHER_SRCDI> (unspec:VI4F_256 @@ -19652,16 +21676,10 @@ (const_int 2) (const_int 3)]))) (clobber (match_scratch:VI4F_256 1 "=&x"))] "TARGET_AVX2" - "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "vex") - (set_attr "mode" "<sseinsnmode>")]) - -;; Memory operand override for -masm=intel of the v*gatherq* patterns. -(define_mode_attr gatherq_mode - [(V4SI "q") (V2DI "x") (V4SF "q") (V2DF "x") - (V8SI "x") (V4DI "t") (V8SF "x") (V4DF "t") - (V16SI "t") (V8DI "g") (V16SF "t") (V8DF "g")]) + "%M2v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "<sseinsnmode>")]) (define_expand "<avx512>_gathersi<mode>" [(parallel [(set (match_operand:VI48F 0 "register_operand") @@ -19682,7 +21700,7 @@ operands[5]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512f_gathersi<mode>" +(define_insn "*avx512f_gathersi<VI48F:mode>" [(set (match_operand:VI48F 0 "register_operand" "=&v") (unspec:VI48F [(match_operand:VI48F 1 "register_operand" "0") @@ -19696,12 +21714,14 @@ UNSPEC_GATHER)) (clobber (match_scratch:<avx512fmaskmode> 2 "=&Yk"))] "TARGET_AVX512F" - "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %<xtg_mode>6}" - [(set_attr "type" "ssemov") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*avx512f_gathersi<mode>_2" +;; %X6 so that we don't emit any *WORD PTR for -masm=intel, as +;; gas changed what it requires incompatibly. + "%M4v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %X6}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512f_gathersi<VI48F:mode>_2" [(set (match_operand:VI48F 0 "register_operand" "=&v") (unspec:VI48F [(pc) @@ -19715,7 +21735,9 @@ UNSPEC_GATHER)) (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))] "TARGET_AVX512F" - "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<xtg_mode>5}" +;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as +;; gas changed what it requires incompatibly. + "%M3v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %X5}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -19740,7 +21762,7 @@ operands[5]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512f_gatherdi<mode>" +(define_insn "*avx512f_gatherdi<VI48F:mode>" [(set (match_operand:VI48F 0 "register_operand" "=&v") (unspec:VI48F [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand" "0") @@ -19754,14 +21776,14 @@ UNSPEC_GATHER)) (clobber (match_scratch:QI 2 "=&Yk"))] "TARGET_AVX512F" -{ - return "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %<gatherq_mode>6}"; -} - [(set_attr "type" "ssemov") - (set_attr "prefix" "evex") - (set_attr "mode" "<sseinsnmode>")]) - -(define_insn "*avx512f_gatherdi<mode>_2" +;; %X6 so that we don't emit any *WORD PTR for -masm=intel, as +;; gas changed what it requires incompatibly. + "%M4v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %X6}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "mode" "<sseinsnmode>")]) + +(define_insn "*avx512f_gatherdi<VI48F:mode>_2" [(set (match_operand:VI48F 0 "register_operand" "=&v") (unspec:VI48F [(pc) @@ -19776,14 +21798,16 @@ (clobber (match_scratch:QI 1 "=&Yk"))] "TARGET_AVX512F" { - if (<MODE>mode != <VEC_GATHER_SRCDI>mode) - { - if (<MODE_SIZE> != 64) - return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%x0%{%1%}, %<gatherq_mode>5}"; + /* %X5 so that we don't emit any *WORD PTR for -masm=intel, as + gas changed what it requires incompatibly. */ + if (<VI48F:MODE>mode != <VEC_GATHER_SRCDI>mode) + { + if (<VI48F:MODE_SIZE> != 64) + return "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %x0%{%1%}|%x0%{%1%}, %X5}"; else - return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %t5}"; - } - return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %<gatherq_mode>5}"; + return "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %X5}"; + } + return "%M3v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %X5}"; } [(set_attr "type" "ssemov") (set_attr "prefix" "evex") @@ -19807,7 +21831,7 @@ operands[4]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512f_scattersi<mode>" +(define_insn "*avx512f_scattersi<VI48F:mode>" [(set (match_operator:VI48F 5 "vsib_mem_operator" [(unspec:P [(match_operand:P 0 "vsib_address_operand" "Tv") @@ -19820,7 +21844,9 @@ UNSPEC_SCATTER)) (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))] "TARGET_AVX512F" - "v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}" +;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as +;; gas changed what it requires incompatibly. + "%M0v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%X5%{%1%}, %3}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -19843,7 +21869,7 @@ operands[4]), UNSPEC_VSIBADDR); }) -(define_insn "*avx512f_scatterdi<mode>" +(define_insn "*avx512f_scatterdi<VI48F:mode>" [(set (match_operator:VI48F 5 "vsib_mem_operator" [(unspec:P [(match_operand:P 0 "vsib_address_operand" "Tv") @@ -19856,11 +21882,9 @@ UNSPEC_SCATTER)) (clobber (match_scratch:QI 1 "=&Yk"))] "TARGET_AVX512F" -{ - if (GET_MODE_SIZE (GET_MODE_INNER (<MODE>mode)) == 8) - return "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}"; - return "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%t5%{%1%}, %3}"; -} +;; %X5 so that we don't emit any *WORD PTR for -masm=intel, as +;; gas changed what it requires incompatibly. + "%M0v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%X5%{%1%}, %3}" [(set_attr "type" "ssemov") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -19997,28 +22021,28 @@ (set_attr "mode" "<MODE>")]) (define_insn "avx512dq_fpclass<mode><mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> - [(match_operand:VF_AVX512VL 1 "register_operand" "v") + [(match_operand:VF_AVX512VL 1 "vector_operand" "vm") (match_operand:QI 2 "const_0_to_255_operand" "n")] UNSPEC_FPCLASS))] "TARGET_AVX512DQ" - "vfpclass<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; + "vfpclass<ssemodesuffix><vecmemsuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; [(set_attr "type" "sse") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "<MODE>")]) -(define_insn "avx512dq_vmfpclass<mode>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") +(define_insn "avx512dq_vmfpclass<mode><mask_scalar_merge_name>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (and:<avx512fmaskmode> (unspec:<avx512fmaskmode> - [(match_operand:VF_128 1 "register_operand" "v") + [(match_operand:VF_128 1 "nonimmediate_operand" "vm") (match_operand:QI 2 "const_0_to_255_operand" "n")] UNSPEC_FPCLASS) (const_int 1)))] "TARGET_AVX512DQ" - "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}"; + "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; [(set_attr "type" "sse") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -20170,9 +22194,11 @@ (define_insn_and_split "avx512f_<castmode><avxsizesuffix>_<castmode>" [(set (match_operand:AVX512MODE2P 0 "nonimmediate_operand" "=x,m") - (unspec:AVX512MODE2P - [(match_operand:<ssequartermode> 1 "nonimmediate_operand" "xm,x")] - UNSPEC_CAST))] + (vec_concat:AVX512MODE2P + (vec_concat:<ssehalfvecmode> + (match_operand:<ssequartermode> 1 "nonimmediate_operand" "xm,x") + (unspec:<ssequartermode> [(const_int 0)] UNSPEC_CAST)) + (unspec:<ssehalfvecmode> [(const_int 0)] UNSPEC_CAST)))] "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "#" "&& reload_completed" @@ -20187,9 +22213,9 @@ (define_insn_and_split "avx512f_<castmode><avxsizesuffix>_256<castmode>" [(set (match_operand:AVX512MODE2P 0 "nonimmediate_operand" "=x,m") - (unspec:AVX512MODE2P - [(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "xm,x")] - UNSPEC_CAST))] + (vec_concat:AVX512MODE2P + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "xm,x") + (unspec:<ssehalfvecmode> [(const_int 0)] UNSPEC_CAST)))] "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "#" "&& reload_completed" @@ -20571,21 +22597,21 @@ "vpopcnt<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}") ;; Save multiple registers out-of-line. -(define_insn "save_multiple<mode>" +(define_insn "*save_multiple<mode>" [(match_parallel 0 "save_multiple" [(use (match_operand:P 1 "symbol_operand"))])] "TARGET_SSE && TARGET_64BIT" "call\t%P1") ;; Restore multiple registers out-of-line. -(define_insn "restore_multiple<mode>" +(define_insn "*restore_multiple<mode>" [(match_parallel 0 "restore_multiple" [(use (match_operand:P 1 "symbol_operand"))])] "TARGET_SSE && TARGET_64BIT" "call\t%P1") ;; Restore multiple registers out-of-line and return. -(define_insn "restore_multiple_and_return<mode>" +(define_insn "*restore_multiple_and_return<mode>" [(match_parallel 0 "restore_multiple" [(return) (use (match_operand:P 1 "symbol_operand")) @@ -20596,7 +22622,7 @@ ;; Restore multiple registers out-of-line when hard frame pointer is used, ;; perform the leave operation prior to returning (from the function). -(define_insn "restore_multiple_leave_return<mode>" +(define_insn "*restore_multiple_leave_return<mode>" [(match_parallel 0 "restore_multiple" [(return) (use (match_operand:P 1 "symbol_operand")) @@ -20615,56 +22641,53 @@ "vpopcnt<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}") (define_insn "vgf2p8affineinvqb_<mode><mask_name>" - [(set (match_operand:VI1_AVX512F 0 "register_operand" "=x,x,v") + [(set (match_operand:VI1_AVX512F 0 "register_operand" "=x,v") (unspec:VI1_AVX512F - [(match_operand:VI1_AVX512F 1 "register_operand" "%0,x,v") - (match_operand:VI1_AVX512F 2 "nonimmediate_operand" "xBm,xm,vm") - (match_operand:QI 3 "const_0_to_255_operand" "n,n,n")] + [(match_operand:VI1_AVX512F 1 "register_operand" "0,v") + (match_operand:VI1_AVX512F 2 "vector_operand" "xBm,vm") + (match_operand:QI 3 "const_0_to_255_operand" "n,n")] UNSPEC_GF2P8AFFINEINV))] "TARGET_GFNI" "@ gf2p8affineinvqb\t{%3, %2, %0| %0, %2, %3} - vgf2p8affineinvqb\t{%3, %2, %1, %0<mask_operand4>| %0<mask_operand4>, %1, %2, %3} vgf2p8affineinvqb\t{%3, %2, %1, %0<mask_operand4>| %0<mask_operand4>, %1, %2, %3}" - [(set_attr "isa" "noavx,avx,avx512f") - (set_attr "prefix_data16" "1,*,*") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,maybe_evex,evex") + [(set_attr "isa" "noavx,avx") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,maybe_evex") (set_attr "mode" "<sseinsnmode>")]) (define_insn "vgf2p8affineqb_<mode><mask_name>" - [(set (match_operand:VI1_AVX512F 0 "register_operand" "=x,x,v") + [(set (match_operand:VI1_AVX512F 0 "register_operand" "=x,v") (unspec:VI1_AVX512F - [(match_operand:VI1_AVX512F 1 "register_operand" "%0,x,v") - (match_operand:VI1_AVX512F 2 "nonimmediate_operand" "xBm,xm,vm") - (match_operand:QI 3 "const_0_to_255_operand" "n,n,n")] + [(match_operand:VI1_AVX512F 1 "register_operand" "0,v") + (match_operand:VI1_AVX512F 2 "vector_operand" "xBm,vm") + (match_operand:QI 3 "const_0_to_255_operand" "n,n")] UNSPEC_GF2P8AFFINE))] "TARGET_GFNI" "@ gf2p8affineqb\t{%3, %2, %0| %0, %2, %3} - vgf2p8affineqb\t{%3, %2, %1, %0<mask_operand4>| %0<mask_operand4>, %1, %2, %3} vgf2p8affineqb\t{%3, %2, %1, %0<mask_operand4>| %0<mask_operand4>, %1, %2, %3}" - [(set_attr "isa" "noavx,avx,avx512f") - (set_attr "prefix_data16" "1,*,*") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,maybe_evex,evex") + [(set_attr "isa" "noavx,avx") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,maybe_evex") (set_attr "mode" "<sseinsnmode>")]) (define_insn "vgf2p8mulb_<mode><mask_name>" - [(set (match_operand:VI1_AVX512F 0 "register_operand" "=x,x,v") + [(set (match_operand:VI1_AVX512F 0 "register_operand" "=x,v") (unspec:VI1_AVX512F - [(match_operand:VI1_AVX512F 1 "register_operand" "%0,x,v") - (match_operand:VI1_AVX512F 2 "nonimmediate_operand" "xBm,xm,vm")] + [(match_operand:VI1_AVX512F 1 "register_operand" "%0,v") + (match_operand:VI1_AVX512F 2 "vector_operand" "xBm,vm")] UNSPEC_GF2P8MUL))] "TARGET_GFNI" "@ gf2p8mulb\t{%2, %0| %0, %2} - vgf2p8mulb\t{%2, %1, %0<mask_operand3>| %0<mask_operand3>, %1, %2} vgf2p8mulb\t{%2, %1, %0<mask_operand3>| %0<mask_operand3>, %1, %2}" - [(set_attr "isa" "noavx,avx,avx512f") - (set_attr "prefix_data16" "1,*,*") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,maybe_evex,evex") + [(set_attr "isa" "noavx,avx") + (set_attr "prefix_data16" "1,*") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "orig,maybe_evex") (set_attr "mode" "<sseinsnmode>")]) (define_insn "vpshrd_<mode><mask_name>" @@ -21026,7 +23049,7 @@ [(set (match_operand:VI1_AVX512VL_F 0 "register_operand" "=v") (unspec:VI1_AVX512VL_F [(match_operand:VI1_AVX512VL_F 1 "register_operand" "v") - (match_operand:VI1_AVX512VL_F 2 "vector_operand" "v")] + (match_operand:VI1_AVX512VL_F 2 "vector_operand" "vm")] UNSPEC_VAESDEC))] "TARGET_VAES" "vaesdec\t{%2, %1, %0|%0, %1, %2}" @@ -21036,7 +23059,7 @@ [(set (match_operand:VI1_AVX512VL_F 0 "register_operand" "=v") (unspec:VI1_AVX512VL_F [(match_operand:VI1_AVX512VL_F 1 "register_operand" "v") - (match_operand:VI1_AVX512VL_F 2 "vector_operand" "v")] + (match_operand:VI1_AVX512VL_F 2 "vector_operand" "vm")] UNSPEC_VAESDECLAST))] "TARGET_VAES" "vaesdeclast\t{%2, %1, %0|%0, %1, %2}" @@ -21073,7 +23096,7 @@ [(set_attr "mode" "DI")]) (define_insn "avx512vl_vpshufbitqmb<mode><mask_scalar_merge_name>" - [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk") + [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=k") (unspec:<avx512fmaskmode> [(match_operand:VI1_AVX512VLBW 1 "register_operand" "v") (match_operand:VI1_AVX512VLBW 2 "nonimmediate_operand" "vm")] @@ -21082,3 +23105,114 @@ "vpshufbitqmb\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}" [(set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) + +(define_mode_iterator VI48_AVX512VP2VL + [V8DI + (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL") + (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")]) + +(define_insn "avx512vp2intersect_2intersect<mode>" + [(set (match_operand:P2QI 0 "register_operand" "=k") + (unspec:P2QI + [(match_operand:VI48_AVX512VP2VL 1 "register_operand" "v") + (match_operand:VI48_AVX512VP2VL 2 "vector_operand" "vm")] + UNSPEC_VP2INTERSECT))] + "TARGET_AVX512VP2INTERSECT" + "vp2intersect<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr ("prefix") ("evex"))]) + +(define_insn "avx512vp2intersect_2intersectv16si" + [(set (match_operand:P2HI 0 "register_operand" "=k") + (unspec:P2HI [(match_operand:V16SI 1 "register_operand" "v") + (match_operand:V16SI 2 "vector_operand" "vm")] + UNSPEC_VP2INTERSECT))] + "TARGET_AVX512VP2INTERSECT" + "vp2intersectd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr ("prefix") ("evex"))]) + +(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")]) +;; Converting from BF to SF +(define_mode_attr bf16_cvt_2sf + [(V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")]) +;; Converting from SF to BF +(define_mode_attr sf_cvt_bf16 + [(V4SF "V8HI") (V8SF "V8HI") (V16SF "V16HI")]) +;; Mapping from BF to SF +(define_mode_attr sf_bf16 + [(V4SF "V8HI") (V8SF "V16HI") (V16SF "V32HI")]) + +(define_expand "avx512f_cvtne2ps2bf16_<mode>_maskz" + [(match_operand:BF16 0 "register_operand") + (match_operand:<bf16_cvt_2sf> 1 "register_operand") + (match_operand:<bf16_cvt_2sf> 2 "register_operand") + (match_operand:<avx512fmaskmode> 3 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_cvtne2ps2bf16_<mode>_mask(operands[0], operands[1], + operands[2], CONST0_RTX(<MODE>mode), operands[3])); + DONE; +}) + +(define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>" + [(set (match_operand:BF16 0 "register_operand" "=v") + (unspec:BF16 + [(match_operand:<bf16_cvt_2sf> 1 "register_operand" "v") + (match_operand:<bf16_cvt_2sf> 2 "register_operand" "v")] + UNSPEC_VCVTNE2PS2BF16))] + "TARGET_AVX512BF16" + "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}") + +(define_expand "avx512f_cvtneps2bf16_<mode>_maskz" + [(match_operand:<sf_cvt_bf16> 0 "register_operand") + (match_operand:VF1_AVX512VL 1 "register_operand") + (match_operand:<avx512fmaskmode> 2 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_cvtneps2bf16_<mode>_mask(operands[0], operands[1], + CONST0_RTX(<sf_cvt_bf16>mode), operands[2])); + DONE; +}) + +(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>" + [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v") + (unspec:<sf_cvt_bf16> + [(match_operand:VF1_AVX512VL 1 "register_operand" "v")] + UNSPEC_VCVTNEPS2BF16))] + "TARGET_AVX512BF16" + "vcvtneps2bf16\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}") + +(define_expand "avx512f_dpbf16ps_<mode>_maskz" + [(match_operand:VF1_AVX512VL 0 "register_operand") + (match_operand:VF1_AVX512VL 1 "register_operand") + (match_operand:<sf_bf16> 2 "register_operand") + (match_operand:<sf_bf16> 3 "register_operand") + (match_operand:<avx512fmaskhalfmode> 4 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_dpbf16ps_<mode>_maskz_1(operands[0], operands[1], + operands[2], operands[3], CONST0_RTX(<MODE>mode), operands[4])); + DONE; +}) + +(define_insn "avx512f_dpbf16ps_<mode><maskz_half_name>" + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v") + (unspec:VF1_AVX512VL + [(match_operand:VF1_AVX512VL 1 "register_operand" "0") + (match_operand:<sf_bf16> 2 "register_operand" "v") + (match_operand:<sf_bf16> 3 "register_operand" "v")] + UNSPEC_VDPBF16PS))] + "TARGET_AVX512BF16" + "vdpbf16ps\t{%3, %2, %0<maskz_half_operand4>|%0<maskz_half_operand4>, %2, %3}") + +(define_insn "avx512f_dpbf16ps_<mode>_mask" + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v") + (vec_merge:VF1_AVX512VL + (unspec:VF1_AVX512VL + [(match_operand:VF1_AVX512VL 1 "register_operand" "0") + (match_operand:<sf_bf16> 2 "register_operand" "v") + (match_operand:<sf_bf16> 3 "register_operand" "v")] + UNSPEC_VDPBF16PS) + (match_dup 1) + (match_operand:<avx512fmaskhalfmode> 4 "register_operand" "Yk")))] + "TARGET_AVX512BF16" + "vdpbf16ps\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}")