Mercurial > hg > CbC > CbC_gcc
diff gcc/config/aarch64/aarch64-simd.md @ 131:84e7813d76e9
gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 07:37:49 +0900 |
parents | 04ced10e8804 |
children | 1830386684a0 |
line wrap: on
line diff
--- a/gcc/config/aarch64/aarch64-simd.md Fri Oct 27 22:46:09 2017 +0900 +++ b/gcc/config/aarch64/aarch64-simd.md Thu Oct 25 07:37:49 2018 +0900 @@ -1,5 +1,5 @@ ;; Machine description for AArch64 AdvSIMD architecture. -;; Copyright (C) 2011-2017 Free Software Foundation, Inc. +;; Copyright (C) 2011-2018 Free Software Foundation, Inc. ;; Contributed by ARM Ltd. ;; ;; This file is part of GCC. @@ -31,9 +31,9 @@ normal str, so the check need not apply. */ if (GET_CODE (operands[0]) == MEM && !(aarch64_simd_imm_zero (operands[1], <MODE>mode) - && ((GET_MODE_SIZE (<MODE>mode) == 16 + && ((known_eq (GET_MODE_SIZE (<MODE>mode), 16) && aarch64_mem_pair_operand (operands[0], DImode)) - || GET_MODE_SIZE (<MODE>mode) == 8))) + || known_eq (GET_MODE_SIZE (<MODE>mode), 8)))) operands[1] = force_reg (<MODE>mode, operands[1]); " ) @@ -80,7 +80,7 @@ )))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]"; } [(set_attr "type" "neon_dup<q>")] @@ -95,14 +95,13 @@ )))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2])); return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]"; } [(set_attr "type" "neon_dup<q>")] ) -(define_insn "*aarch64_simd_mov<mode>" +(define_insn "*aarch64_simd_mov<VD:mode>" [(set (match_operand:VD 0 "nonimmediate_operand" "=w, m, m, w, ?r, ?w, ?r, w") (match_operand:VD 1 "general_operand" @@ -121,19 +120,18 @@ case 5: return "fmov\t%d0, %1"; case 6: return "mov\t%0, %1"; case 7: - return aarch64_output_simd_mov_immediate (operands[1], - <MODE>mode, 64); + return aarch64_output_simd_mov_immediate (operands[1], 64); default: gcc_unreachable (); } } - [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\ + [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\ neon_logic<q>, neon_to_gp<q>, f_mcr,\ mov_reg, neon_move<q>")] ) -(define_insn "*aarch64_simd_mov<mode>" +(define_insn "*aarch64_simd_mov<VQ:mode>" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umn, m, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" "m, Dz, w, w, w, r, r, Dn"))] "TARGET_SIMD @@ -155,13 +153,13 @@ case 6: return "#"; case 7: - return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128); + return aarch64_output_simd_mov_immediate (operands[1], 128); default: gcc_unreachable (); } } - [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\ - neon_stp, neon_logic<q>, multiple, multiple,\ + [(set_attr "type" "neon_load1_1reg<q>, store_16, neon_store1_1reg<q>,\ + neon_logic<q>, multiple, multiple,\ multiple, neon_move<q>") (set_attr "length" "4,4,4,4,8,8,8,4")] ) @@ -174,39 +172,67 @@ (vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w") (parallel [(match_operand 2 "const_int_operand" "n")])))] "TARGET_SIMD - && ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])) == 0" + && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0" "str\\t%<Vetype>1, %0" [(set_attr "type" "neon_store1_1reg<q>")] ) -(define_insn "load_pair<mode>" - [(set (match_operand:VD 0 "register_operand" "=w") - (match_operand:VD 1 "aarch64_mem_pair_operand" "Ump")) - (set (match_operand:VD 2 "register_operand" "=w") - (match_operand:VD 3 "memory_operand" "m"))] +(define_insn "load_pair<DREG:mode><DREG2:mode>" + [(set (match_operand:DREG 0 "register_operand" "=w") + (match_operand:DREG 1 "aarch64_mem_pair_operand" "Ump")) + (set (match_operand:DREG2 2 "register_operand" "=w") + (match_operand:DREG2 3 "memory_operand" "m"))] "TARGET_SIMD && rtx_equal_p (XEXP (operands[3], 0), plus_constant (Pmode, XEXP (operands[1], 0), - GET_MODE_SIZE (<MODE>mode)))" + GET_MODE_SIZE (<DREG:MODE>mode)))" "ldp\\t%d0, %d2, %1" [(set_attr "type" "neon_ldp")] ) -(define_insn "store_pair<mode>" - [(set (match_operand:VD 0 "aarch64_mem_pair_operand" "=Ump") - (match_operand:VD 1 "register_operand" "w")) - (set (match_operand:VD 2 "memory_operand" "=m") - (match_operand:VD 3 "register_operand" "w"))] +(define_insn "vec_store_pair<DREG:mode><DREG2:mode>" + [(set (match_operand:DREG 0 "aarch64_mem_pair_operand" "=Ump") + (match_operand:DREG 1 "register_operand" "w")) + (set (match_operand:DREG2 2 "memory_operand" "=m") + (match_operand:DREG2 3 "register_operand" "w"))] "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0), plus_constant (Pmode, XEXP (operands[0], 0), - GET_MODE_SIZE (<MODE>mode)))" + GET_MODE_SIZE (<DREG:MODE>mode)))" "stp\\t%d1, %d3, %0" [(set_attr "type" "neon_stp")] ) +(define_insn "load_pair<VQ:mode><VQ2:mode>" + [(set (match_operand:VQ 0 "register_operand" "=w") + (match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump")) + (set (match_operand:VQ2 2 "register_operand" "=w") + (match_operand:VQ2 3 "memory_operand" "m"))] + "TARGET_SIMD + && rtx_equal_p (XEXP (operands[3], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (<VQ:MODE>mode)))" + "ldp\\t%q0, %q2, %1" + [(set_attr "type" "neon_ldp_q")] +) + +(define_insn "vec_store_pair<VQ:mode><VQ2:mode>" + [(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump") + (match_operand:VQ 1 "register_operand" "w")) + (set (match_operand:VQ2 2 "memory_operand" "=m") + (match_operand:VQ2 3 "register_operand" "w"))] + "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0), + plus_constant (Pmode, + XEXP (operands[0], 0), + GET_MODE_SIZE (<VQ:MODE>mode)))" + "stp\\t%q1, %q3, %0" + [(set_attr "type" "neon_stp_q")] +) + + (define_split [(set (match_operand:VQ 0 "register_operand" "") (match_operand:VQ 1 "register_operand" ""))] @@ -231,7 +257,7 @@ DONE; }) -(define_expand "aarch64_split_simd_mov<mode>" +(define_expand "@aarch64_split_simd_mov<mode>" [(set (match_operand:VQ 0) (match_operand:VQ 1))] "TARGET_SIMD" @@ -254,8 +280,8 @@ { rtx dst_low_part = gen_lowpart (<VHALF>mode, dst); rtx dst_high_part = gen_highpart (<VHALF>mode, dst); - rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, false); - rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); + rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_simd_mov_from_<mode>low (dst_low_part, src, lo)); @@ -451,8 +477,7 @@ DOTPROD)))] "TARGET_DOTPROD" { - operands[4] - = GEN_INT (ENDIAN_LANE_N (V8QImode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4])); return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]"; } [(set_attr "type" "neon_dot")] @@ -467,8 +492,7 @@ DOTPROD)))] "TARGET_DOTPROD" { - operands[4] - = GEN_INT (ENDIAN_LANE_N (V16QImode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4])); return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]"; } [(set_attr "type" "neon_dot")] @@ -502,7 +526,7 @@ (match_operand:VMUL 3 "register_operand" "w")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"; } [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")] @@ -518,8 +542,7 @@ (match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2])); return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"; } [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")] @@ -536,7 +559,7 @@ [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")] ) -(define_insn "aarch64_rsqrte<mode>" +(define_insn "@aarch64_rsqrte<mode>" [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")] UNSPEC_RSQRTE))] @@ -544,7 +567,7 @@ "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>" [(set_attr "type" "neon_fp_rsqrte_<stype><q>")]) -(define_insn "aarch64_rsqrts<mode>" +(define_insn "@aarch64_rsqrts<mode>" [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w") (match_operand:VHSDF_HSDF 2 "register_operand" "w")] @@ -572,7 +595,7 @@ (match_operand:DF 3 "register_operand" "w")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2])); return "fmul\\t%0.2d, %3.2d, %1.d[%2]"; } [(set_attr "type" "neon_fp_mul_d_scalar_q")] @@ -617,6 +640,67 @@ [(set_attr "type" "neon_abd<q>")] ) +(define_insn "aarch64_<sur>abdl2<mode>_3" + [(set (match_operand:<VDBLW> 0 "register_operand" "=w") + (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w") + (match_operand:VDQV_S 2 "register_operand" "w")] + ABDL2))] + "TARGET_SIMD" + "<sur>abdl2\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>" + [(set_attr "type" "neon_abd<q>")] +) + +(define_insn "aarch64_<sur>abal<mode>_4" + [(set (match_operand:<VDBLW> 0 "register_operand" "=w") + (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w") + (match_operand:VDQV_S 2 "register_operand" "w") + (match_operand:<VDBLW> 3 "register_operand" "0")] + ABAL))] + "TARGET_SIMD" + "<sur>abal\t%0.<Vwtype>, %1.<Vhalftype>, %2.<Vhalftype>" + [(set_attr "type" "neon_arith_acc<q>")] +) + +(define_insn "aarch64_<sur>adalp<mode>_3" + [(set (match_operand:<VDBLW> 0 "register_operand" "=w") + (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w") + (match_operand:<VDBLW> 2 "register_operand" "0")] + ADALP))] + "TARGET_SIMD" + "<sur>adalp\t%0.<Vwtype>, %1.<Vtype>" + [(set_attr "type" "neon_reduc_add<q>")] +) + +;; Emit a sequence to produce a sum-of-absolute-differences of the V16QI +;; inputs in operands 1 and 2. The sequence also has to perform a widening +;; reduction of the difference into a V4SI vector and accumulate that into +;; operand 3 before copying that into the result operand 0. +;; Perform that with a sequence of: +;; UABDL2 tmp.8h, op1.16b, op2.16b +;; UABAL tmp.8h, op1.16b, op2.16b +;; UADALP op3.4s, tmp.8h +;; MOV op0, op3 // should be eliminated in later passes. +;; The signed version just uses the signed variants of the above instructions. + +(define_expand "<sur>sadv16qi" + [(use (match_operand:V4SI 0 "register_operand")) + (unspec:V16QI [(use (match_operand:V16QI 1 "register_operand")) + (use (match_operand:V16QI 2 "register_operand"))] ABAL) + (use (match_operand:V4SI 3 "register_operand"))] + "TARGET_SIMD" + { + rtx reduc = gen_reg_rtx (V8HImode); + emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1], + operands[2])); + emit_insn (gen_aarch64_<sur>abalv16qi_4 (reduc, operands[1], + operands[2], reduc)); + emit_insn (gen_aarch64_<sur>adalpv8hi_3 (operands[3], reduc, + operands[3])); + emit_move_insn (operands[0], operands[3]); + DONE; + } +) + (define_insn "aba<mode>_3" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (plus:VDQ_BHSI (abs:VDQ_BHSI (minus:VDQ_BHSI @@ -651,8 +735,8 @@ case 0: return "and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>"; case 1: - return aarch64_output_simd_mov_immediate (operands[2], - <MODE>mode, GET_MODE_BITSIZE (<MODE>mode), AARCH64_CHECK_BIC); + return aarch64_output_simd_mov_immediate (operands[2], <bitsize>, + AARCH64_CHECK_BIC); default: gcc_unreachable (); } @@ -672,8 +756,8 @@ case 0: return "orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>"; case 1: - return aarch64_output_simd_mov_immediate (operands[2], - <MODE>mode, GET_MODE_BITSIZE (<MODE>mode), AARCH64_CHECK_ORR); + return aarch64_output_simd_mov_immediate (operands[2], <bitsize>, + AARCH64_CHECK_ORR); default: gcc_unreachable (); } @@ -699,29 +783,29 @@ ) (define_insn "aarch64_simd_vec_set<mode>" - [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w,w,w") - (vec_merge:VDQ_BHSI - (vec_duplicate:VDQ_BHSI - (match_operand:<VEL> 1 "aarch64_simd_general_operand" "r,w,Utv")) - (match_operand:VDQ_BHSI 3 "register_operand" "0,0,0") + [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w") + (vec_merge:VALL_F16 + (vec_duplicate:VALL_F16 + (match_operand:<VEL> 1 "aarch64_simd_general_operand" "w,?r,Utv")) + (match_operand:VALL_F16 3 "register_operand" "0,0,0") (match_operand:SI 2 "immediate_operand" "i,i,i")))] "TARGET_SIMD" { - int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2]))); + int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))); operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt); switch (which_alternative) { case 0: - return "ins\\t%0.<Vetype>[%p2], %w1"; + return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]"; case 1: - return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]"; + return "ins\\t%0.<Vetype>[%p2], %<vwcore>1"; case 2: return "ld1\\t{%0.<Vetype>}[%p2], %1"; default: gcc_unreachable (); } } - [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_one_lane<q>")] + [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")] ) (define_insn "*aarch64_simd_vec_copy_lane<mode>" @@ -736,9 +820,9 @@ (match_operand:SI 2 "immediate_operand" "i")))] "TARGET_SIMD" { - int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2]))); + int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))); operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt); - operands[4] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[4])); return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; } @@ -757,10 +841,10 @@ (match_operand:SI 2 "immediate_operand" "i")))] "TARGET_SIMD" { - int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2]))); + int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2]))); operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt); - operands[4] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, + INTVAL (operands[4])); return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]"; } @@ -1035,19 +1119,6 @@ } ) -(define_expand "vec_set<mode>" - [(match_operand:VDQ_BHSI 0 "register_operand") - (match_operand:<VEL> 1 "register_operand") - (match_operand:SI 2 "immediate_operand")] - "TARGET_SIMD" - { - HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]); - emit_insn (gen_aarch64_simd_vec_set<mode> (operands[0], operands[1], - GEN_INT (elem), operands[0])); - DONE; - } -) - ;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero. (define_insn "vec_shr_<mode>" [(set (match_operand:VD 0 "register_operand" "=w") @@ -1064,62 +1135,8 @@ [(set_attr "type" "neon_shift_imm")] ) -(define_insn "aarch64_simd_vec_setv2di" - [(set (match_operand:V2DI 0 "register_operand" "=w,w") - (vec_merge:V2DI - (vec_duplicate:V2DI - (match_operand:DI 1 "register_operand" "r,w")) - (match_operand:V2DI 3 "register_operand" "0,0") - (match_operand:SI 2 "immediate_operand" "i,i")))] - "TARGET_SIMD" - { - int elt = ENDIAN_LANE_N (V2DImode, exact_log2 (INTVAL (operands[2]))); - operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt); - switch (which_alternative) - { - case 0: - return "ins\\t%0.d[%p2], %1"; - case 1: - return "ins\\t%0.d[%p2], %1.d[0]"; - default: - gcc_unreachable (); - } - } - [(set_attr "type" "neon_from_gp, neon_ins_q")] -) - -(define_expand "vec_setv2di" - [(match_operand:V2DI 0 "register_operand") - (match_operand:DI 1 "register_operand") - (match_operand:SI 2 "immediate_operand")] - "TARGET_SIMD" - { - HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]); - emit_insn (gen_aarch64_simd_vec_setv2di (operands[0], operands[1], - GEN_INT (elem), operands[0])); - DONE; - } -) - -(define_insn "aarch64_simd_vec_set<mode>" - [(set (match_operand:VDQF_F16 0 "register_operand" "=w") - (vec_merge:VDQF_F16 - (vec_duplicate:VDQF_F16 - (match_operand:<VEL> 1 "register_operand" "w")) - (match_operand:VDQF_F16 3 "register_operand" "0") - (match_operand:SI 2 "immediate_operand" "i")))] - "TARGET_SIMD" - { - int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2]))); - - operands[2] = GEN_INT ((HOST_WIDE_INT)1 << elt); - return "ins\t%0.<Vetype>[%p2], %1.<Vetype>[0]"; - } - [(set_attr "type" "neon_ins<q>")] -) - (define_expand "vec_set<mode>" - [(match_operand:VDQF_F16 0 "register_operand" "+w") + [(match_operand:VALL_F16 0 "register_operand" "+w") (match_operand:<VEL> 1 "register_operand" "w") (match_operand:SI 2 "immediate_operand" "")] "TARGET_SIMD" @@ -1155,7 +1172,7 @@ (match_operand:VDQHS 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] @@ -1173,8 +1190,7 @@ (match_operand:VDQHS 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2])); return "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] @@ -1214,7 +1230,7 @@ (match_operand:VDQHS 3 "register_operand" "w"))))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] @@ -1232,8 +1248,7 @@ (match_operand:VDQHS 3 "register_operand" "w"))))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2])); return "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] @@ -1337,9 +1352,8 @@ fmov\\t%d0, %1 dup\\t%d0, %1" [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>") - (set_attr "simd" "yes,*,yes") - (set_attr "fp" "*,yes,*") - (set_attr "length" "4")] + (set_attr "length" "4") + (set_attr "arch" "simd,fp,simd")] ) (define_insn "move_lo_quad_internal_<mode>" @@ -1353,9 +1367,8 @@ fmov\\t%d0, %1 dup\\t%d0, %1" [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>") - (set_attr "simd" "yes,*,yes") - (set_attr "fp" "*,yes,*") - (set_attr "length" "4")] + (set_attr "length" "4") + (set_attr "arch" "simd,fp,simd")] ) (define_insn "move_lo_quad_internal_be_<mode>" @@ -1369,9 +1382,8 @@ fmov\\t%d0, %1 dup\\t%d0, %1" [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>") - (set_attr "simd" "yes,*,yes") - (set_attr "fp" "*,yes,*") - (set_attr "length" "4")] + (set_attr "length" "4") + (set_attr "arch" "simd,fp,simd")] ) (define_insn "move_lo_quad_internal_be_<mode>" @@ -1385,9 +1397,8 @@ fmov\\t%d0, %1 dup\\t%d0, %1" [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>") - (set_attr "simd" "yes,*,yes") - (set_attr "fp" "*,yes,*") - (set_attr "length" "4")] + (set_attr "length" "4") + (set_attr "arch" "simd,fp,simd")] ) (define_expand "move_lo_quad_<mode>" @@ -1441,7 +1452,7 @@ (match_operand:<VHALF> 1 "register_operand" "")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); if (BYTES_BIG_ENDIAN) emit_insn (gen_aarch64_simd_move_hi_quad_be_<mode> (operands[0], operands[1], p)); @@ -1505,7 +1516,7 @@ (match_operand:VQW 2 "vect_par_cnst_lo_half" "") )))] "TARGET_SIMD" - "<su>shll\t%0.<Vwtype>, %1.<Vhalftype>, 0" + "<su>xtl\t%0.<Vwtype>, %1.<Vhalftype>" [(set_attr "type" "neon_shift_imm_long")] ) @@ -1516,7 +1527,7 @@ (match_operand:VQW 2 "vect_par_cnst_hi_half" "") )))] "TARGET_SIMD" - "<su>shll2\t%0.<Vwtype>, %1.<Vtype>, 0" + "<su>xtl2\t%0.<Vwtype>, %1.<Vtype>" [(set_attr "type" "neon_shift_imm_long")] ) @@ -1525,7 +1536,7 @@ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_simd_vec_unpack<su>_hi_<mode> (operands[0], operands[1], p)); DONE; @@ -1537,7 +1548,7 @@ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand" ""))] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); emit_insn (gen_aarch64_simd_vec_unpack<su>_lo_<mode> (operands[0], operands[1], p)); DONE; @@ -1657,7 +1668,7 @@ (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand" ""))] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); emit_insn (gen_aarch64_simd_vec_<su>mult_lo_<mode> (operands[0], operands[1], operands[2], p)); @@ -1684,7 +1695,7 @@ (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand" ""))] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_simd_vec_<su>mult_hi_<mode> (operands[0], operands[1], operands[2], p)); @@ -1803,7 +1814,7 @@ (match_operand:VDQF 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] @@ -1820,8 +1831,7 @@ (match_operand:VDQSF 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2])); return "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] @@ -1849,7 +1859,7 @@ (match_operand:DF 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2])); return "fmla\\t%0.2d, %3.2d, %1.2d[%2]"; } [(set_attr "type" "neon_fp_mla_d_scalar_q")] @@ -1858,9 +1868,8 @@ (define_insn "fnma<mode>4" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF - (match_operand:VHSDF 1 "register_operand" "w") - (neg:VHSDF - (match_operand:VHSDF 2 "register_operand" "w")) + (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (match_operand:VHSDF 2 "register_operand" "w") (match_operand:VHSDF 3 "register_operand" "0")))] "TARGET_SIMD" "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" @@ -1879,7 +1888,7 @@ (match_operand:VDQF 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] @@ -1897,8 +1906,7 @@ (match_operand:VDQSF 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2])); return "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"; } [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] @@ -1928,7 +1936,7 @@ (match_operand:DF 4 "register_operand" "0")))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2])); return "fmls\\t%0.2d, %3.2d, %1.2d[%2]"; } [(set_attr "type" "neon_fp_mla_d_scalar_q")] @@ -2090,7 +2098,7 @@ (match_operand:VQ_HSF 1 "register_operand" "")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0], operands[1], p)); DONE; @@ -2113,7 +2121,7 @@ (match_operand:VQ_HSF 1 "register_operand" "")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0], operands[1], p)); DONE; @@ -2221,8 +2229,9 @@ ;; Max/Min are introduced by idiom recognition by GCC's mid-end. An ;; expression like: ;; a = (b < c) ? b : c; -;; is idiom-matched as MIN_EXPR<b,c> only if -ffinite-math-only is enabled -;; either explicitly or indirectly via -ffast-math. +;; is idiom-matched as MIN_EXPR<b,c> only if -ffinite-math-only and +;; -fno-signed-zeros are enabled either explicitly or indirectly via +;; -ffast-math. ;; ;; MIN_EXPR and MAX_EXPR eventually map to 'smin' and 'smax' in RTL. ;; The 'smax' and 'smin' RTL standard pattern names do not specify which @@ -2261,7 +2270,7 @@ UNSPEC_ADDV)] "TARGET_SIMD" { - rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0)); + rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0); rtx scratch = gen_reg_rtx (<MODE>mode); emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1])); emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt)); @@ -2312,7 +2321,7 @@ UNSPEC_FADDV))] "TARGET_SIMD" { - rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0)); + rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0); rtx scratch = gen_reg_rtx (V4SFmode); emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1])); emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch)); @@ -2347,14 +2356,14 @@ ;; 'across lanes' max and min ops. ;; Template for outputting a scalar, so we can create __builtins which can be -;; gimple_fold'd to the REDUC_(MAX|MIN)_EXPR tree code. (This is FP smax/smin). +;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function. (This is FP smax/smin). (define_expand "reduc_<maxmin_uns>_scal_<mode>" [(match_operand:<VEL> 0 "register_operand") (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")] FMAXMINV)] "TARGET_SIMD" { - rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0)); + rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0); rtx scratch = gen_reg_rtx (<MODE>mode); emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch, operands[1])); @@ -2370,7 +2379,7 @@ MAXMINV)] "TARGET_SIMD" { - rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0)); + rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0); rtx scratch = gen_reg_rtx (<MODE>mode); emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch, operands[1])); @@ -2427,13 +2436,13 @@ ;; in *aarch64_simd_bsl<mode>_alt. (define_insn "aarch64_simd_bsl<mode>_internal" - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") - (xor:VSDQ_I_DI - (and:VSDQ_I_DI - (xor:VSDQ_I_DI + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") + (xor:VDQ_I + (and:VDQ_I + (xor:VDQ_I (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w") - (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0")) - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) + (match_operand:VDQ_I 2 "register_operand" "w,w,0")) + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) (match_dup:<V_INT_EQUIV> 3) ))] "TARGET_SIMD" @@ -2451,14 +2460,14 @@ ;; permutations of commutative operations, we have to have a separate pattern. (define_insn "*aarch64_simd_bsl<mode>_alt" - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") - (xor:VSDQ_I_DI - (and:VSDQ_I_DI - (xor:VSDQ_I_DI - (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0") - (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w")) - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) - (match_dup:VSDQ_I_DI 2)))] + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") + (xor:VDQ_I + (and:VDQ_I + (xor:VDQ_I + (match_operand:VDQ_I 3 "register_operand" "w,w,0") + (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w")) + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) + (match_dup:<V_INT_EQUIV> 2)))] "TARGET_SIMD" "@ bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype> @@ -2467,6 +2476,100 @@ [(set_attr "type" "neon_bsl<q>")] ) +;; DImode is special, we want to avoid computing operations which are +;; more naturally computed in general purpose registers in the vector +;; registers. If we do that, we need to move all three operands from general +;; purpose registers to vector registers, then back again. However, we +;; don't want to make this pattern an UNSPEC as we'd lose scope for +;; optimizations based on the component operations of a BSL. +;; +;; That means we need a splitter back to the individual operations, if they +;; would be better calculated on the integer side. + +(define_insn_and_split "aarch64_simd_bsldi_internal" + [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r") + (xor:DI + (and:DI + (xor:DI + (match_operand:DI 3 "register_operand" "w,0,w,r") + (match_operand:DI 2 "register_operand" "w,w,0,r")) + (match_operand:DI 1 "register_operand" "0,w,w,r")) + (match_dup:DI 3) + ))] + "TARGET_SIMD" + "@ + bsl\\t%0.8b, %2.8b, %3.8b + bit\\t%0.8b, %2.8b, %1.8b + bif\\t%0.8b, %3.8b, %1.8b + #" + "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)] +{ + /* Split back to individual operations. If we're before reload, and + able to create a temporary register, do so. If we're after reload, + we've got an early-clobber destination register, so use that. + Otherwise, we can't create pseudos and we can't yet guarantee that + operands[0] is safe to write, so FAIL to split. */ + + rtx scratch; + if (reload_completed) + scratch = operands[0]; + else if (can_create_pseudo_p ()) + scratch = gen_reg_rtx (DImode); + else + FAIL; + + emit_insn (gen_xordi3 (scratch, operands[2], operands[3])); + emit_insn (gen_anddi3 (scratch, scratch, operands[1])); + emit_insn (gen_xordi3 (operands[0], scratch, operands[3])); + DONE; +} + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") + (set_attr "length" "4,4,4,12")] +) + +(define_insn_and_split "aarch64_simd_bsldi_alt" + [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r") + (xor:DI + (and:DI + (xor:DI + (match_operand:DI 3 "register_operand" "w,w,0,r") + (match_operand:DI 2 "register_operand" "w,0,w,r")) + (match_operand:DI 1 "register_operand" "0,w,w,r")) + (match_dup:DI 2) + ))] + "TARGET_SIMD" + "@ + bsl\\t%0.8b, %3.8b, %2.8b + bit\\t%0.8b, %3.8b, %1.8b + bif\\t%0.8b, %2.8b, %1.8b + #" + "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))" + [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)] +{ + /* Split back to individual operations. If we're before reload, and + able to create a temporary register, do so. If we're after reload, + we've got an early-clobber destination register, so use that. + Otherwise, we can't create pseudos and we can't yet guarantee that + operands[0] is safe to write, so FAIL to split. */ + + rtx scratch; + if (reload_completed) + scratch = operands[0]; + else if (can_create_pseudo_p ()) + scratch = gen_reg_rtx (DImode); + else + FAIL; + + emit_insn (gen_xordi3 (scratch, operands[2], operands[3])); + emit_insn (gen_anddi3 (scratch, scratch, operands[1])); + emit_insn (gen_xordi3 (operands[0], scratch, operands[2])); + DONE; +} + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") + (set_attr "length" "4,4,4,12")] +) + (define_expand "aarch64_simd_bsl<mode>" [(match_operand:VALLDIF 0 "register_operand") (match_operand:<V_INT_EQUIV> 1 "register_operand") @@ -2646,10 +2749,10 @@ break; } /* Fall through. */ - case UNGE: + case UNLT: std::swap (operands[2], operands[3]); /* Fall through. */ - case UNLE: + case UNGT: case GT: comparison = gen_aarch64_cmgt<mode>; break; @@ -2660,10 +2763,10 @@ break; } /* Fall through. */ - case UNGT: + case UNLE: std::swap (operands[2], operands[3]); /* Fall through. */ - case UNLT: + case UNGE: case GE: comparison = gen_aarch64_cmge<mode>; break; @@ -2674,6 +2777,7 @@ case UNEQ: case ORDERED: case UNORDERED: + case LTGT: break; default: gcc_unreachable (); @@ -2685,21 +2789,41 @@ case UNGT: case UNLE: case UNLT: - case NE: - /* FCM returns false for lanes which are unordered, so if we use - the inverse of the comparison we actually want to emit, then - invert the result, we will end up with the correct result. - Note that a NE NaN and NaN NE b are true for all a, b. - - Our transformations are: - a UNGE b -> !(b GT a) - a UNGT b -> !(b GE a) - a UNLE b -> !(a GT b) - a UNLT b -> !(a GE b) - a NE b -> !(a EQ b) */ - gcc_assert (comparison != NULL); - emit_insn (comparison (operands[0], operands[2], operands[3])); - emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0])); + { + /* All of the above must not raise any FP exceptions. Thus we first + check each operand for NaNs and force any elements containing NaN to + zero before using them in the compare. + Example: UN<cc> (a, b) -> UNORDERED (a, b) | + (cm<cc> (isnan (a) ? 0.0 : a, + isnan (b) ? 0.0 : b)) + We use the following transformations for doing the comparisions: + a UNGE b -> a GE b + a UNGT b -> a GT b + a UNLE b -> b GE a + a UNLT b -> b GT a. */ + + rtx tmp0 = gen_reg_rtx (<V_INT_EQUIV>mode); + rtx tmp1 = gen_reg_rtx (<V_INT_EQUIV>mode); + rtx tmp2 = gen_reg_rtx (<V_INT_EQUIV>mode); + emit_insn (gen_aarch64_cmeq<mode> (tmp0, operands[2], operands[2])); + emit_insn (gen_aarch64_cmeq<mode> (tmp1, operands[3], operands[3])); + emit_insn (gen_and<v_int_equiv>3 (tmp2, tmp0, tmp1)); + emit_insn (gen_and<v_int_equiv>3 (tmp0, tmp0, + lowpart_subreg (<V_INT_EQUIV>mode, + operands[2], + <MODE>mode))); + emit_insn (gen_and<v_int_equiv>3 (tmp1, tmp1, + lowpart_subreg (<V_INT_EQUIV>mode, + operands[3], + <MODE>mode))); + gcc_assert (comparison != NULL); + emit_insn (comparison (operands[0], + lowpart_subreg (<MODE>mode, + tmp0, <V_INT_EQUIV>mode), + lowpart_subreg (<MODE>mode, + tmp1, <V_INT_EQUIV>mode))); + emit_insn (gen_orn<v_int_equiv>3 (operands[0], tmp2, operands[0])); + } break; case LT: @@ -2707,42 +2831,46 @@ case GT: case GE: case EQ: + case NE: /* The easy case. Here we emit one of FCMGE, FCMGT or FCMEQ. As a LT b <=> b GE a && a LE b <=> b GT a. Our transformations are: a GE b -> a GE b a GT b -> a GT b a LE b -> b GE a a LT b -> b GT a - a EQ b -> a EQ b */ + a EQ b -> a EQ b + a NE b -> ~(a EQ b) */ gcc_assert (comparison != NULL); emit_insn (comparison (operands[0], operands[2], operands[3])); + if (code == NE) + emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0])); break; - case UNEQ: - /* We first check (a > b || b > a) which is !UNEQ, inverting - this result will then give us (a == b || a UNORDERED b). */ + case LTGT: + /* LTGT is not guranteed to not generate a FP exception. So let's + go the faster way : ((a > b) || (b > a)). */ emit_insn (gen_aarch64_cmgt<mode> (operands[0], operands[2], operands[3])); emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[3], operands[2])); emit_insn (gen_ior<v_int_equiv>3 (operands[0], operands[0], tmp)); - emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0])); - break; - - case UNORDERED: - /* Operands are ORDERED iff (a > b || b >= a), so we can compute - UNORDERED as !ORDERED. */ - emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[2], operands[3])); - emit_insn (gen_aarch64_cmge<mode> (operands[0], - operands[3], operands[2])); - emit_insn (gen_ior<v_int_equiv>3 (operands[0], operands[0], tmp)); - emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0])); break; case ORDERED: - emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[2], operands[3])); - emit_insn (gen_aarch64_cmge<mode> (operands[0], - operands[3], operands[2])); - emit_insn (gen_ior<v_int_equiv>3 (operands[0], operands[0], tmp)); + case UNORDERED: + case UNEQ: + /* cmeq (a, a) & cmeq (b, b). */ + emit_insn (gen_aarch64_cmeq<mode> (operands[0], + operands[2], operands[2])); + emit_insn (gen_aarch64_cmeq<mode> (tmp, operands[3], operands[3])); + emit_insn (gen_and<v_int_equiv>3 (operands[0], operands[0], tmp)); + + if (code == UNORDERED) + emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0])); + else if (code == UNEQ) + { + emit_insn (gen_aarch64_cmeq<mode> (tmp, operands[2], operands[3])); + emit_insn (gen_orn<v_int_equiv>3 (operands[0], operands[0], tmp)); + } break; default: @@ -2895,37 +3023,38 @@ (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "smov\\t%<GPI:w>0, %1.<VDQQH:Vetype>[%2]"; } - [(set_attr "type" "neon_to_gp<q>")] -) - -(define_insn "*aarch64_get_lane_zero_extendsi<mode>" - [(set (match_operand:SI 0 "register_operand" "=r") - (zero_extend:SI - (vec_select:<VEL> - (match_operand:VDQQH 1 "register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] - "TARGET_SIMD" - { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); - return "umov\\t%w0, %1.<Vetype>[%2]"; - } - [(set_attr "type" "neon_to_gp<q>")] + [(set_attr "type" "neon_to_gp<q>")] +) + +(define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>" + [(set (match_operand:GPI 0 "register_operand" "=r") + (zero_extend:GPI + (vec_select:<VEL> + (match_operand:VDQQH 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_SIMD" + { + operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode, + INTVAL (operands[2])); + return "umov\\t%w0, %1.<Vetype>[%2]"; + } + [(set_attr "type" "neon_to_gp<q>")] ) ;; Lane extraction of a value, neither sign nor zero extension ;; is guaranteed so upper bits should be considered undefined. ;; RTL uses GCC vector extension indices throughout so flip only for assembly. (define_insn "aarch64_get_lane<mode>" - [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") + [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv") (vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w, w, w") (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); switch (which_alternative) { case 0: @@ -2941,37 +3070,61 @@ [(set_attr "type" "neon_to_gp<q>, neon_dup<q>, neon_store1_one_lane<q>")] ) +(define_insn "load_pair_lanes<mode>" + [(set (match_operand:<VDBL> 0 "register_operand" "=w") + (vec_concat:<VDBL> + (match_operand:VDC 1 "memory_operand" "Utq") + (match_operand:VDC 2 "memory_operand" "m")))] + "TARGET_SIMD && !STRICT_ALIGNMENT + && rtx_equal_p (XEXP (operands[2], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (<MODE>mode)))" + "ldr\\t%q0, %1" + [(set_attr "type" "neon_load1_1reg_q")] +) + +(define_insn "store_pair_lanes<mode>" + [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn") + (vec_concat:<VDBL> + (match_operand:VDC 1 "register_operand" "w, r") + (match_operand:VDC 2 "register_operand" "w, r")))] + "TARGET_SIMD" + "@ + stp\\t%d1, %d2, %y0 + stp\\t%x1, %x2, %y0" + [(set_attr "type" "neon_stp, store_16")] +) + ;; In this insn, operand 1 should be low, and operand 2 the high part of the ;; dest vector. (define_insn "*aarch64_combinez<mode>" [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w") - (vec_concat:<VDBL> - (match_operand:VD_BHSI 1 "general_operand" "w,?r,m") - (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))] + (vec_concat:<VDBL> + (match_operand:VDC 1 "general_operand" "w,?r,m") + (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ mov\\t%0.8b, %1.8b fmov\t%d0, %1 ldr\\t%d0, %1" [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg") - (set_attr "simd" "yes,*,yes") - (set_attr "fp" "*,yes,*")] + (set_attr "arch" "simd,fp,simd")] ) (define_insn "*aarch64_combinez_be<mode>" [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w") (vec_concat:<VDBL> - (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz") - (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")))] + (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero") + (match_operand:VDC 1 "general_operand" "w,?r,m")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ mov\\t%0.8b, %1.8b fmov\t%d0, %1 ldr\\t%d0, %1" [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg") - (set_attr "simd" "yes,*,yes") - (set_attr "fp" "*,yes,*")] + (set_attr "arch" "simd,fp,simd")] ) (define_expand "aarch64_combine<mode>" @@ -2986,7 +3139,7 @@ } ) -(define_expand "aarch64_simd_combine<mode>" +(define_expand "@aarch64_simd_combine<mode>" [(match_operand:<VDBL> 0 "register_operand") (match_operand:VDC 1 "register_operand") (match_operand:VDC 2 "register_operand")] @@ -3034,7 +3187,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_saddl<mode>_hi_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3046,7 +3199,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_uaddl<mode>_hi_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3058,7 +3211,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_ssubl<mode>_hi_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3070,7 +3223,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_usubl<mode>_hi_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3096,7 +3249,7 @@ (match_operand:<VDBLW> 2 "register_operand" "")))] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); rtx temp = gen_reg_rtx (GET_MODE (operands[0])); emit_insn (gen_aarch64_saddw<mode>_internal (temp, operands[2], @@ -3124,7 +3277,7 @@ (match_operand:<VDBLW> 2 "register_operand" "")))] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false); rtx temp = gen_reg_rtx (GET_MODE (operands[0])); emit_insn (gen_aarch64_uaddw<mode>_internal (temp, operands[2], @@ -3145,38 +3298,74 @@ DONE; }) -(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>" +(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w") + (ANY_EXTEND:<VWIDE> + (match_operand:VD_BHSI 2 "register_operand" "w"))))] + "TARGET_SIMD" + "<ANY_EXTEND:su>subw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>" + [(set_attr "type" "neon_sub_widen")] +) + +(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>_internal" [(set (match_operand:<VWIDE> 0 "register_operand" "=w") - (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w") - (ANY_EXTEND:<VWIDE> - (match_operand:VD_BHSI 2 "register_operand" "w"))))] - "TARGET_SIMD" - "<ANY_EXTEND:su><ADDSUB:optab>w\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>" - [(set_attr "type" "neon_<ADDSUB:optab>_widen")] -) - -(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>_internal" + (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w") + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQW 2 "register_operand" "w") + (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))))] + "TARGET_SIMD" + "<ANY_EXTEND:su>subw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>" + [(set_attr "type" "neon_sub_widen")] +) + +(define_insn "aarch64_<ANY_EXTEND:su>subw2<mode>_internal" [(set (match_operand:<VWIDE> 0 "register_operand" "=w") - (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w") - (ANY_EXTEND:<VWIDE> - (vec_select:<VHALF> - (match_operand:VQW 2 "register_operand" "w") - (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))))] - "TARGET_SIMD" - "<ANY_EXTEND:su><ADDSUB:optab>w\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>" - [(set_attr "type" "neon_<ADDSUB:optab>_widen")] -) - -(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w2<mode>_internal" + (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w") + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQW 2 "register_operand" "w") + (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))))] + "TARGET_SIMD" + "<ANY_EXTEND:su>subw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>" + [(set_attr "type" "neon_sub_widen")] +) + +(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (plus:<VWIDE> + (ANY_EXTEND:<VWIDE> (match_operand:VD_BHSI 2 "register_operand" "w")) + (match_operand:<VWIDE> 1 "register_operand" "w")))] + "TARGET_SIMD" + "<ANY_EXTEND:su>addw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>" + [(set_attr "type" "neon_add_widen")] +) + +(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>_internal" [(set (match_operand:<VWIDE> 0 "register_operand" "=w") - (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w") - (ANY_EXTEND:<VWIDE> - (vec_select:<VHALF> - (match_operand:VQW 2 "register_operand" "w") - (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))))] - "TARGET_SIMD" - "<ANY_EXTEND:su><ADDSUB:optab>w2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>" - [(set_attr "type" "neon_<ADDSUB:optab>_widen")] + (plus:<VWIDE> + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQW 2 "register_operand" "w") + (match_operand:VQW 3 "vect_par_cnst_lo_half" ""))) + (match_operand:<VWIDE> 1 "register_operand" "w")))] + "TARGET_SIMD" + "<ANY_EXTEND:su>addw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>" + [(set_attr "type" "neon_add_widen")] +) + +(define_insn "aarch64_<ANY_EXTEND:su>addw2<mode>_internal" + [(set (match_operand:<VWIDE> 0 "register_operand" "=w") + (plus:<VWIDE> + (ANY_EXTEND:<VWIDE> + (vec_select:<VHALF> + (match_operand:VQW 2 "register_operand" "w") + (match_operand:VQW 3 "vect_par_cnst_hi_half" ""))) + (match_operand:<VWIDE> 1 "register_operand" "w")))] + "TARGET_SIMD" + "<ANY_EXTEND:su>addw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>" + [(set_attr "type" "neon_add_widen")] ) (define_expand "aarch64_saddw2<mode>" @@ -3185,7 +3374,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_saddw2<mode>_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3197,7 +3386,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_uaddw2<mode>_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3210,7 +3399,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_ssubw2<mode>_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3222,7 +3411,7 @@ (match_operand:VQW 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_usubw2<mode>_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -3230,6 +3419,22 @@ ;; <su><r>h<addsub>. +(define_expand "<u>avg<mode>3_floor" + [(set (match_operand:VDQ_BHSI 0 "register_operand") + (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand") + (match_operand:VDQ_BHSI 2 "register_operand")] + HADD))] + "TARGET_SIMD" +) + +(define_expand "<u>avg<mode>3_ceil" + [(set (match_operand:VDQ_BHSI 0 "register_operand") + (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand") + (match_operand:VDQ_BHSI 2 "register_operand")] + RHADD))] + "TARGET_SIMD" +) + (define_insn "aarch64_<sur>h<addsub><mode>" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand" "w") @@ -3301,8 +3506,7 @@ UNSPEC_FMULX))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode, - INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[3])); return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_fp_mul_<Vetype>_scalar<q>")] @@ -3321,7 +3525,7 @@ UNSPEC_FMULX))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3])); return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_fp_mul_<Vetype><q>")] @@ -3355,7 +3559,7 @@ UNSPEC_FMULX))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3])); return "fmulx\t%<Vetype>0, %<Vetype>1, %2.<Vetype>[%3]"; } [(set_attr "type" "fmul<Vetype>")] @@ -3441,7 +3645,7 @@ VQDMULH))] "TARGET_SIMD" "* - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3])); return \"sq<r>dmulh\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]\";" [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")] ) @@ -3456,7 +3660,7 @@ VQDMULH))] "TARGET_SIMD" "* - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3])); return \"sq<r>dmulh\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]\";" [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")] ) @@ -3471,7 +3675,7 @@ VQDMULH))] "TARGET_SIMD" "* - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3])); return \"sq<r>dmulh\\t%<v>0, %<v>1, %2.<v>[%3]\";" [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")] ) @@ -3486,7 +3690,7 @@ VQDMULH))] "TARGET_SIMD" "* - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3])); return \"sq<r>dmulh\\t%<v>0, %<v>1, %2.<v>[%3]\";" [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")] ) @@ -3518,7 +3722,7 @@ SQRDMLH_AS))] "TARGET_SIMD_RDMA" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4])); return "sqrdml<SQRDMLH_AS:rdma_as>h\\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[%4]"; } @@ -3536,7 +3740,7 @@ SQRDMLH_AS))] "TARGET_SIMD_RDMA" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4])); return "sqrdml<SQRDMLH_AS:rdma_as>h\\t%<v>0, %<v>2, %3.<Vetype>[%4]"; } @@ -3556,7 +3760,7 @@ SQRDMLH_AS))] "TARGET_SIMD_RDMA" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4])); return "sqrdml<SQRDMLH_AS:rdma_as>h\\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[%4]"; } @@ -3574,7 +3778,7 @@ SQRDMLH_AS))] "TARGET_SIMD_RDMA" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4])); return "sqrdml<SQRDMLH_AS:rdma_as>h\\t%<v>0, %<v>2, %3.<v>[%4]"; } @@ -3618,7 +3822,7 @@ (const_int 1))))] "TARGET_SIMD" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4])); return "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]"; } @@ -3642,7 +3846,7 @@ (const_int 1))))] "TARGET_SIMD" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4])); return "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]"; } @@ -3665,7 +3869,7 @@ (const_int 1))))] "TARGET_SIMD" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4])); return "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]"; } @@ -3688,7 +3892,7 @@ (const_int 1))))] "TARGET_SIMD" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4])); return "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]"; } @@ -3743,7 +3947,7 @@ (match_operand:VQ_HSI 3 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlal2<mode>_internal (operands[0], operands[1], operands[2], operands[3], p)); DONE; @@ -3756,7 +3960,7 @@ (match_operand:VQ_HSI 3 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlsl2<mode>_internal (operands[0], operands[1], operands[2], operands[3], p)); DONE; @@ -3783,7 +3987,7 @@ (const_int 1))))] "TARGET_SIMD" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4])); return "sqdml<SBINQOPS:as>l2\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]"; } @@ -3809,7 +4013,7 @@ (const_int 1))))] "TARGET_SIMD" { - operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4]))); + operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4])); return "sqdml<SBINQOPS:as>l2\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]"; } @@ -3824,7 +4028,7 @@ (match_operand:SI 4 "immediate_operand" "i")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlal2_lane<mode>_internal (operands[0], operands[1], operands[2], operands[3], operands[4], p)); @@ -3839,7 +4043,7 @@ (match_operand:SI 4 "immediate_operand" "i")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlal2_laneq<mode>_internal (operands[0], operands[1], operands[2], operands[3], operands[4], p)); @@ -3854,7 +4058,7 @@ (match_operand:SI 4 "immediate_operand" "i")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlsl2_lane<mode>_internal (operands[0], operands[1], operands[2], operands[3], operands[4], p)); @@ -3869,7 +4073,7 @@ (match_operand:SI 4 "immediate_operand" "i")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlsl2_laneq<mode>_internal (operands[0], operands[1], operands[2], operands[3], operands[4], p)); @@ -3902,7 +4106,7 @@ (match_operand:<VEL> 3 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlal2_n<mode>_internal (operands[0], operands[1], operands[2], operands[3], p)); @@ -3916,7 +4120,7 @@ (match_operand:<VEL> 3 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmlsl2_n<mode>_internal (operands[0], operands[1], operands[2], operands[3], p)); @@ -3956,7 +4160,7 @@ (const_int 1)))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3])); return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")] @@ -3977,7 +4181,7 @@ (const_int 1)))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3])); return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")] @@ -3997,7 +4201,7 @@ (const_int 1)))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3])); return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")] @@ -4017,7 +4221,7 @@ (const_int 1)))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3])); return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")] @@ -4070,7 +4274,7 @@ (match_operand:VQ_HSI 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmull2<mode>_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -4095,7 +4299,7 @@ (const_int 1)))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3])); return "sqdmull2\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")] @@ -4118,7 +4322,7 @@ (const_int 1)))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3])); return "sqdmull2\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]"; } [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")] @@ -4131,7 +4335,7 @@ (match_operand:SI 3 "immediate_operand" "i")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmull2_lane<mode>_internal (operands[0], operands[1], operands[2], operands[3], p)); @@ -4145,7 +4349,7 @@ (match_operand:SI 3 "immediate_operand" "i")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmull2_laneq<mode>_internal (operands[0], operands[1], operands[2], operands[3], p)); @@ -4178,7 +4382,7 @@ (match_operand:<VEL> 2 "register_operand" "w")] "TARGET_SIMD" { - rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true); + rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true); emit_insn (gen_aarch64_sqdmull2_n<mode>_internal (operands[0], operands[1], operands[2], p)); DONE; @@ -4343,7 +4547,7 @@ (clobber (reg:CC CC_REGNUM))] "TARGET_SIMD" "#" - "reload_completed" + "&& reload_completed" [(set (match_operand:DI 0 "register_operand") (neg:DI (COMPARISONS:DI @@ -4406,7 +4610,7 @@ (clobber (reg:CC CC_REGNUM))] "TARGET_SIMD" "#" - "reload_completed" + "&& reload_completed" [(set (match_operand:DI 0 "register_operand") (neg:DI (UCOMPARISONS:DI @@ -4477,7 +4681,7 @@ (clobber (reg:CC CC_REGNUM))] "TARGET_SIMD" "#" - "reload_completed" + "&& reload_completed" [(set (match_operand:DI 0 "register_operand") (neg:DI (ne:DI @@ -4624,7 +4828,7 @@ UNSPEC_LD2_LANE))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3])); return "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1"; } [(set_attr "type" "neon_load2_one_lane")] @@ -4640,7 +4844,7 @@ if (BYTES_BIG_ENDIAN) { rtx tmp = gen_reg_rtx (OImode); - rtx mask = aarch64_reverse_mask (<MODE>mode); + rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>); emit_insn (gen_aarch64_simd_ld2<mode> (tmp, operands[1])); emit_insn (gen_aarch64_rev_reglistoi (operands[0], tmp, mask)); } @@ -4668,7 +4872,7 @@ UNSPEC_ST2_LANE))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "st2\\t{%S1.<Vetype> - %T1.<Vetype>}[%2], %0"; } [(set_attr "type" "neon_store2_one_lane<q>")] @@ -4684,7 +4888,7 @@ if (BYTES_BIG_ENDIAN) { rtx tmp = gen_reg_rtx (OImode); - rtx mask = aarch64_reverse_mask (<MODE>mode); + rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>); emit_insn (gen_aarch64_rev_reglistoi (tmp, operands[1], mask)); emit_insn (gen_aarch64_simd_st2<mode> (operands[0], tmp)); } @@ -4722,7 +4926,7 @@ UNSPEC_LD3_LANE))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3])); return "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1"; } [(set_attr "type" "neon_load3_one_lane")] @@ -4738,7 +4942,7 @@ if (BYTES_BIG_ENDIAN) { rtx tmp = gen_reg_rtx (CImode); - rtx mask = aarch64_reverse_mask (<MODE>mode); + rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>); emit_insn (gen_aarch64_simd_ld3<mode> (tmp, operands[1])); emit_insn (gen_aarch64_rev_reglistci (operands[0], tmp, mask)); } @@ -4766,7 +4970,7 @@ UNSPEC_ST3_LANE))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "st3\\t{%S1.<Vetype> - %U1.<Vetype>}[%2], %0"; } [(set_attr "type" "neon_store3_one_lane<q>")] @@ -4782,7 +4986,7 @@ if (BYTES_BIG_ENDIAN) { rtx tmp = gen_reg_rtx (CImode); - rtx mask = aarch64_reverse_mask (<MODE>mode); + rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>); emit_insn (gen_aarch64_rev_reglistci (tmp, operands[1], mask)); emit_insn (gen_aarch64_simd_st3<mode> (operands[0], tmp)); } @@ -4820,7 +5024,7 @@ UNSPEC_LD4_LANE))] "TARGET_SIMD" { - operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3]))); + operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3])); return "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1"; } [(set_attr "type" "neon_load4_one_lane")] @@ -4836,7 +5040,7 @@ if (BYTES_BIG_ENDIAN) { rtx tmp = gen_reg_rtx (XImode); - rtx mask = aarch64_reverse_mask (<MODE>mode); + rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>); emit_insn (gen_aarch64_simd_ld4<mode> (tmp, operands[1])); emit_insn (gen_aarch64_rev_reglistxi (operands[0], tmp, mask)); } @@ -4864,7 +5068,7 @@ UNSPEC_ST4_LANE))] "TARGET_SIMD" { - operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2]))); + operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2])); return "st4\\t{%S1.<Vetype> - %V1.<Vetype>}[%2], %0"; } [(set_attr "type" "neon_store4_one_lane<q>")] @@ -4880,7 +5084,7 @@ if (BYTES_BIG_ENDIAN) { rtx tmp = gen_reg_rtx (XImode); - rtx mask = aarch64_reverse_mask (<MODE>mode); + rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>); emit_insn (gen_aarch64_rev_reglistxi (tmp, operands[1], mask)); emit_insn (gen_aarch64_simd_st4<mode> (operands[0], tmp)); } @@ -4928,6 +5132,70 @@ } }) + +(define_expand "aarch64_ld1x3<VALLDIF:mode>" + [(match_operand:CI 0 "register_operand" "=w") + (match_operand:DI 1 "register_operand" "r") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + rtx mem = gen_rtx_MEM (CImode, operands[1]); + emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem)); + DONE; +}) + +(define_insn "aarch64_ld1_x3_<mode>" + [(set (match_operand:CI 0 "register_operand" "=w") + (unspec:CI + [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv") + (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))] + "TARGET_SIMD" + "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1" + [(set_attr "type" "neon_load1_3reg<q>")] +) + +(define_expand "aarch64_st1x2<VALLDIF:mode>" + [(match_operand:DI 0 "register_operand" "") + (match_operand:OI 1 "register_operand" "") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + rtx mem = gen_rtx_MEM (OImode, operands[0]); + emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1])); + DONE; +}) + +(define_insn "aarch64_st1_x2_<mode>" + [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:OI + [(match_operand:OI 1 "register_operand" "w") + (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))] + "TARGET_SIMD" + "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0" + [(set_attr "type" "neon_store1_2reg<q>")] +) + +(define_expand "aarch64_st1x3<VALLDIF:mode>" + [(match_operand:DI 0 "register_operand" "") + (match_operand:CI 1 "register_operand" "") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + rtx mem = gen_rtx_MEM (CImode, operands[0]); + emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1])); + DONE; +}) + +(define_insn "aarch64_st1_x3_<mode>" + [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:CI + [(match_operand:CI 1 "register_operand" "w") + (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))] + "TARGET_SIMD" + "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0" + [(set_attr "type" "neon_store1_3reg<q>")] +) + (define_insn "*aarch64_mov<mode>" [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w") (match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))] @@ -5176,6 +5444,33 @@ DONE; }) +(define_expand "aarch64_ld1x2<VQ:mode>" + [(match_operand:OI 0 "register_operand" "=w") + (match_operand:DI 1 "register_operand" "r") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + machine_mode mode = OImode; + rtx mem = gen_rtx_MEM (mode, operands[1]); + + emit_insn (gen_aarch64_simd_ld1<VQ:mode>_x2 (operands[0], mem)); + DONE; +}) + +(define_expand "aarch64_ld1x2<VDC:mode>" + [(match_operand:OI 0 "register_operand" "=w") + (match_operand:DI 1 "register_operand" "r") + (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + machine_mode mode = OImode; + rtx mem = gen_rtx_MEM (mode, operands[1]); + + emit_insn (gen_aarch64_simd_ld1<VDC:mode>_x2 (operands[0], mem)); + DONE; +}) + + (define_expand "aarch64_ld<VSTRUCT:nregs>_lane<VALLDIF:mode>" [(match_operand:VSTRUCT 0 "register_operand" "=w") (match_operand:DI 1 "register_operand" "w") @@ -5188,9 +5483,7 @@ set_mem_size (mem, GET_MODE_SIZE (GET_MODE_INNER (<VALLDIF:MODE>mode)) * <VSTRUCT:nregs>); - aarch64_simd_lane_bounds (operands[3], 0, - GET_MODE_NUNITS (<VALLDIF:MODE>mode), - NULL); + aarch64_simd_lane_bounds (operands[3], 0, <VALLDIF:nunits>, NULL); emit_insn (gen_aarch64_vec_load_lanes<VSTRUCT:mode>_lane<VALLDIF:mode> ( operands[0], mem, operands[2], operands[3])); DONE; @@ -5238,20 +5531,6 @@ ;; vec_perm support -(define_expand "vec_perm_const<mode>" - [(match_operand:VALL_F16 0 "register_operand") - (match_operand:VALL_F16 1 "register_operand") - (match_operand:VALL_F16 2 "register_operand") - (match_operand:<V_INT_EQUIV> 3)] - "TARGET_SIMD" -{ - if (aarch64_expand_vec_perm_const (operands[0], operands[1], - operands[2], operands[3])) - DONE; - else - FAIL; -}) - (define_expand "vec_perm<mode>" [(match_operand:VB 0 "register_operand") (match_operand:VB 1 "register_operand") @@ -5260,7 +5539,7 @@ "TARGET_SIMD" { aarch64_expand_vec_perm (operands[0], operands[1], - operands[2], operands[3]); + operands[2], operands[3], <nunits>); DONE; }) @@ -5369,6 +5648,9 @@ [(set_attr "type" "multiple")] ) +;; This instruction's pattern is generated directly by +;; aarch64_expand_vec_perm_const, so any changes to the pattern would +;; need corresponding changes there. (define_insn "aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") @@ -5379,7 +5661,10 @@ [(set_attr "type" "neon_permute<q>")] ) -;; Note immediate (third) operand is lane index not byte index. +;; This instruction's pattern is generated directly by +;; aarch64_expand_vec_perm_const, so any changes to the pattern would +;; need corresponding changes there. Note that the immediate (third) +;; operand is a lane index not a byte index. (define_insn "aarch64_ext<mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") @@ -5395,6 +5680,9 @@ [(set_attr "type" "neon_ext<q>")] ) +;; This instruction's pattern is generated directly by +;; aarch64_expand_vec_perm_const, so any changes to the pattern would +;; need corresponding changes there. (define_insn "aarch64_rev<REVERSE:rev_op><mode>" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")] @@ -5563,25 +5851,47 @@ [(set_attr "type" "neon_load1_all_lanes")] ) -(define_insn "aarch64_frecpe<mode>" - [(set (match_operand:VHSDF 0 "register_operand" "=w") - (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")] +(define_insn "aarch64_simd_ld1<mode>_x2" + [(set (match_operand:OI 0 "register_operand" "=w") + (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv") + (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD1))] + "TARGET_SIMD" + "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1" + [(set_attr "type" "neon_load1_2reg<q>")] +) + +(define_insn "aarch64_simd_ld1<mode>_x2" + [(set (match_operand:OI 0 "register_operand" "=w") + (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv") + (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD1))] + "TARGET_SIMD" + "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1" + [(set_attr "type" "neon_load1_2reg<q>")] +) + + +(define_insn "@aarch64_frecpe<mode>" + [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") + (unspec:VHSDF_HSDF + [(match_operand:VHSDF_HSDF 1 "register_operand" "w")] UNSPEC_FRECPE))] "TARGET_SIMD" - "frecpe\\t%0.<Vtype>, %1.<Vtype>" + "frecpe\t%<v>0<Vmtype>, %<v>1<Vmtype>" [(set_attr "type" "neon_fp_recpe_<stype><q>")] ) -(define_insn "aarch64_frecp<FRECP:frecp_suffix><mode>" +(define_insn "aarch64_frecpx<mode>" [(set (match_operand:GPF_F16 0 "register_operand" "=w") (unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")] - FRECP))] - "TARGET_SIMD" - "frecp<FRECP:frecp_suffix>\\t%<s>0, %<s>1" - [(set_attr "type" "neon_fp_recp<FRECP:frecp_suffix>_<GPF_F16:stype>")] -) - -(define_insn "aarch64_frecps<mode>" + UNSPEC_FRECPX))] + "TARGET_SIMD" + "frecpx\t%<s>0, %<s>1" + [(set_attr "type" "neon_fp_recpx_<GPF_F16:stype>")] +) + +(define_insn "@aarch64_frecps<mode>" [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w") (unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w") @@ -5617,10 +5927,33 @@ (define_insn "aarch64_crypto_aes<aes_op>v16qi" [(set (match_operand:V16QI 0 "register_operand" "=w") - (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%0") (match_operand:V16QI 2 "register_operand" "w")] CRYPTO_AES))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_AES" + "aes<aes_op>\\t%0.16b, %2.16b" + [(set_attr "type" "crypto_aese")] +) + +(define_insn "*aarch64_crypto_aes<aes_op>v16qi_xor_combine" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w")) + (match_operand:V16QI 3 "aarch64_simd_imm_zero" "")] + CRYPTO_AES))] + "TARGET_SIMD && TARGET_AES" + "aes<aes_op>\\t%0.16b, %2.16b" + [(set_attr "type" "crypto_aese")] +) + +(define_insn "*aarch64_crypto_aes<aes_op>v16qi_xor_combine" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI [(match_operand:V16QI 3 "aarch64_simd_imm_zero" "") + (xor:V16QI (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + CRYPTO_AES))] + "TARGET_SIMD && TARGET_AES" "aes<aes_op>\\t%0.16b, %2.16b" [(set_attr "type" "crypto_aese")] ) @@ -5635,7 +5968,7 @@ [(set (match_operand:V16QI 0 "register_operand" "=w,w") (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")] CRYPTO_AESMC))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_AES" "aes<aesmc_op>\\t%0.16b, %1.16b" [(set_attr "type" "crypto_aesmc") (set_attr_alternative "enabled" @@ -5646,6 +5979,44 @@ (const_string "yes")])] ) +;; When AESE/AESMC fusion is enabled we really want to keep the two together +;; and enforce the register dependency without scheduling or register +;; allocation messing up the order or introducing moves inbetween. +;; Mash the two together during combine. + +(define_insn "*aarch64_crypto_aese_fused" + [(set (match_operand:V16QI 0 "register_operand" "=&w") + (unspec:V16QI + [(unspec:V16QI + [(match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESE) + ] UNSPEC_AESMC))] + "TARGET_SIMD && TARGET_AES + && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" + "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b" + [(set_attr "type" "crypto_aese") + (set_attr "length" "8")] +) + +;; When AESD/AESIMC fusion is enabled we really want to keep the two together +;; and enforce the register dependency without scheduling or register +;; allocation messing up the order or introducing moves inbetween. +;; Mash the two together during combine. + +(define_insn "*aarch64_crypto_aesd_fused" + [(set (match_operand:V16QI 0 "register_operand" "=&w") + (unspec:V16QI + [(unspec:V16QI + [(match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESD) + ] UNSPEC_AESIMC))] + "TARGET_SIMD && TARGET_AES + && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" + "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b" + [(set_attr "type" "crypto_aese") + (set_attr "length" "8")] +) + ;; sha1 (define_insn "aarch64_crypto_sha1hsi" @@ -5653,7 +6024,7 @@ (unspec:SI [(match_operand:SI 1 "register_operand" "w")] UNSPEC_SHA1H))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_SHA2" "sha1h\\t%s0, %s1" [(set_attr "type" "crypto_sha1_fast")] ) @@ -5663,7 +6034,7 @@ (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w") (parallel [(const_int 0)]))] UNSPEC_SHA1H))] - "TARGET_SIMD && TARGET_CRYPTO && !BYTES_BIG_ENDIAN" + "TARGET_SIMD && TARGET_SHA2 && !BYTES_BIG_ENDIAN" "sha1h\\t%s0, %s1" [(set_attr "type" "crypto_sha1_fast")] ) @@ -5673,7 +6044,7 @@ (unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w") (parallel [(const_int 3)]))] UNSPEC_SHA1H))] - "TARGET_SIMD && TARGET_CRYPTO && BYTES_BIG_ENDIAN" + "TARGET_SIMD && TARGET_SHA2 && BYTES_BIG_ENDIAN" "sha1h\\t%s0, %s1" [(set_attr "type" "crypto_sha1_fast")] ) @@ -5683,7 +6054,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") (match_operand:V4SI 2 "register_operand" "w")] UNSPEC_SHA1SU1))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_SHA2" "sha1su1\\t%0.4s, %2.4s" [(set_attr "type" "crypto_sha1_fast")] ) @@ -5694,7 +6065,7 @@ (match_operand:SI 2 "register_operand" "w") (match_operand:V4SI 3 "register_operand" "w")] CRYPTO_SHA1))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_SHA2" "sha1<sha1_op>\\t%q0, %s2, %3.4s" [(set_attr "type" "crypto_sha1_slow")] ) @@ -5705,7 +6076,7 @@ (match_operand:V4SI 2 "register_operand" "w") (match_operand:V4SI 3 "register_operand" "w")] UNSPEC_SHA1SU0))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_SHA2" "sha1su0\\t%0.4s, %2.4s, %3.4s" [(set_attr "type" "crypto_sha1_xor")] ) @@ -5718,7 +6089,7 @@ (match_operand:V4SI 2 "register_operand" "w") (match_operand:V4SI 3 "register_operand" "w")] CRYPTO_SHA256))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_SHA2" "sha256h<sha256_op>\\t%q0, %q2, %3.4s" [(set_attr "type" "crypto_sha256_slow")] ) @@ -5728,7 +6099,7 @@ (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") (match_operand:V4SI 2 "register_operand" "w")] UNSPEC_SHA256SU0))] - "TARGET_SIMD &&TARGET_CRYPTO" + "TARGET_SIMD && TARGET_SHA2" "sha256su0\\t%0.4s, %2.4s" [(set_attr "type" "crypto_sha256_fast")] ) @@ -5739,11 +6110,728 @@ (match_operand:V4SI 2 "register_operand" "w") (match_operand:V4SI 3 "register_operand" "w")] UNSPEC_SHA256SU1))] - "TARGET_SIMD &&TARGET_CRYPTO" + "TARGET_SIMD && TARGET_SHA2" "sha256su1\\t%0.4s, %2.4s, %3.4s" [(set_attr "type" "crypto_sha256_slow")] ) +;; sha512 + +(define_insn "aarch64_crypto_sha512h<sha512_op>qv2di" + [(set (match_operand:V2DI 0 "register_operand" "=w") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "w") + (match_operand:V2DI 3 "register_operand" "w")] + CRYPTO_SHA512))] + "TARGET_SIMD && TARGET_SHA3" + "sha512h<sha512_op>\\t%q0, %q2, %3.2d" + [(set_attr "type" "crypto_sha512")] +) + +(define_insn "aarch64_crypto_sha512su0qv2di" + [(set (match_operand:V2DI 0 "register_operand" "=w") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "w")] + UNSPEC_SHA512SU0))] + "TARGET_SIMD && TARGET_SHA3" + "sha512su0\\t%0.2d, %2.2d" + [(set_attr "type" "crypto_sha512")] +) + +(define_insn "aarch64_crypto_sha512su1qv2di" + [(set (match_operand:V2DI 0 "register_operand" "=w") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "w") + (match_operand:V2DI 3 "register_operand" "w")] + UNSPEC_SHA512SU1))] + "TARGET_SIMD && TARGET_SHA3" + "sha512su1\\t%0.2d, %2.2d, %3.2d" + [(set_attr "type" "crypto_sha512")] +) + +;; sha3 + +(define_insn "eor3q<mode>4" + [(set (match_operand:VQ_I 0 "register_operand" "=w") + (xor:VQ_I + (xor:VQ_I + (match_operand:VQ_I 2 "register_operand" "w") + (match_operand:VQ_I 3 "register_operand" "w")) + (match_operand:VQ_I 1 "register_operand" "w")))] + "TARGET_SIMD && TARGET_SHA3" + "eor3\\t%0.16b, %1.16b, %2.16b, %3.16b" + [(set_attr "type" "crypto_sha3")] +) + +(define_insn "aarch64_rax1qv2di" + [(set (match_operand:V2DI 0 "register_operand" "=w") + (xor:V2DI + (rotate:V2DI + (match_operand:V2DI 2 "register_operand" "w") + (const_int 1)) + (match_operand:V2DI 1 "register_operand" "w")))] + "TARGET_SIMD && TARGET_SHA3" + "rax1\\t%0.2d, %1.2d, %2.2d" + [(set_attr "type" "crypto_sha3")] +) + +(define_insn "aarch64_xarqv2di" + [(set (match_operand:V2DI 0 "register_operand" "=w") + (rotatert:V2DI + (xor:V2DI + (match_operand:V2DI 1 "register_operand" "%w") + (match_operand:V2DI 2 "register_operand" "w")) + (match_operand:SI 3 "aarch64_simd_shift_imm_di" "Usd")))] + "TARGET_SIMD && TARGET_SHA3" + "xar\\t%0.2d, %1.2d, %2.2d, %3" + [(set_attr "type" "crypto_sha3")] +) + +(define_insn "bcaxq<mode>4" + [(set (match_operand:VQ_I 0 "register_operand" "=w") + (xor:VQ_I + (and:VQ_I + (not:VQ_I (match_operand:VQ_I 3 "register_operand" "w")) + (match_operand:VQ_I 2 "register_operand" "w")) + (match_operand:VQ_I 1 "register_operand" "w")))] + "TARGET_SIMD && TARGET_SHA3" + "bcax\\t%0.16b, %1.16b, %2.16b, %3.16b" + [(set_attr "type" "crypto_sha3")] +) + +;; SM3 + +(define_insn "aarch64_sm3ss1qv4si" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "w") + (match_operand:V4SI 2 "register_operand" "w") + (match_operand:V4SI 3 "register_operand" "w")] + UNSPEC_SM3SS1))] + "TARGET_SIMD && TARGET_SM4" + "sm3ss1\\t%0.4s, %1.4s, %2.4s, %3.4s" + [(set_attr "type" "crypto_sm3")] +) + + +(define_insn "aarch64_sm3tt<sm3tt_op>qv4si" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "register_operand" "w") + (match_operand:V4SI 3 "register_operand" "w") + (match_operand:SI 4 "aarch64_imm2" "Ui2")] + CRYPTO_SM3TT))] + "TARGET_SIMD && TARGET_SM4" + "sm3tt<sm3tt_op>\\t%0.4s, %2.4s, %3.4s[%4]" + [(set_attr "type" "crypto_sm3")] +) + +(define_insn "aarch64_sm3partw<sm3part_op>qv4si" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "register_operand" "w") + (match_operand:V4SI 3 "register_operand" "w")] + CRYPTO_SM3PART))] + "TARGET_SIMD && TARGET_SM4" + "sm3partw<sm3part_op>\\t%0.4s, %2.4s, %3.4s" + [(set_attr "type" "crypto_sm3")] +) + +;; SM4 + +(define_insn "aarch64_sm4eqv4si" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0") + (match_operand:V4SI 2 "register_operand" "w")] + UNSPEC_SM4E))] + "TARGET_SIMD && TARGET_SM4" + "sm4e\\t%0.4s, %2.4s" + [(set_attr "type" "crypto_sm4")] +) + +(define_insn "aarch64_sm4ekeyqv4si" + [(set (match_operand:V4SI 0 "register_operand" "=w") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "w") + (match_operand:V4SI 2 "register_operand" "w")] + UNSPEC_SM4EKEY))] + "TARGET_SIMD && TARGET_SM4" + "sm4ekey\\t%0.4s, %1.4s, %2.4s" + [(set_attr "type" "crypto_sm4")] +) + +;; fp16fml + +(define_expand "aarch64_fml<f16mac1>l<f16quad>_low<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (unspec:VDQSF + [(match_operand:VDQSF 1 "register_operand" "0") + (match_operand:<VFMLA_W> 2 "register_operand" "w") + (match_operand:<VFMLA_W> 3 "register_operand" "w")] + VFMLA16_LOW))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, + <nunits> * 2, false); + rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, + <nunits> * 2, false); + + emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_low<mode> (operands[0], + operands[1], + operands[2], + operands[3], + p1, p2)); + DONE; + +}) + +(define_expand "aarch64_fml<f16mac1>l<f16quad>_high<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (unspec:VDQSF + [(match_operand:VDQSF 1 "register_operand" "0") + (match_operand:<VFMLA_W> 2 "register_operand" "w") + (match_operand:<VFMLA_W> 3 "register_operand" "w")] + VFMLA16_HIGH))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, <nunits> * 2, true); + rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, <nunits> * 2, true); + + emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_high<mode> (operands[0], + operands[1], + operands[2], + operands[3], + p1, p2)); + DONE; +}) + +(define_insn "aarch64_simd_fmlal<f16quad>_low<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (fma:VDQSF + (float_extend:VDQSF + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 2 "register_operand" "w") + (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" ""))) + (float_extend:VDQSF + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 3 "register_operand" "w") + (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" ""))) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlsl<f16quad>_low<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (fma:VDQSF + (float_extend:VDQSF + (neg:<VFMLA_SEL_W> + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 2 "register_operand" "w") + (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" "")))) + (float_extend:VDQSF + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 3 "register_operand" "w") + (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" ""))) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlal<f16quad>_high<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (fma:VDQSF + (float_extend:VDQSF + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 2 "register_operand" "w") + (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" ""))) + (float_extend:VDQSF + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 3 "register_operand" "w") + (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" ""))) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlsl<f16quad>_high<mode>" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (fma:VDQSF + (float_extend:VDQSF + (neg:<VFMLA_SEL_W> + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 2 "register_operand" "w") + (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" "")))) + (float_extend:VDQSF + (vec_select:<VFMLA_SEL_W> + (match_operand:<VFMLA_W> 3 "register_operand" "w") + (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" ""))) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_expand "aarch64_fml<f16mac1>l_lane_lowv2sf" + [(set (match_operand:V2SF 0 "register_operand" "") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") + (match_operand:V4HF 2 "register_operand" "") + (match_operand:V4HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_imm2" "")] + VFMLA16_LOW))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, false); + rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_lowv2sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; +} +) + +(define_expand "aarch64_fml<f16mac1>l_lane_highv2sf" + [(set (match_operand:V2SF 0 "register_operand" "") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") + (match_operand:V4HF 2 "register_operand" "") + (match_operand:V4HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_imm2" "")] + VFMLA16_HIGH))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, true); + rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_highv2sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; +}) + +(define_insn "aarch64_simd_fmlal_lane_lowv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlsl_lane_lowv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (neg:V2HF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlal_lane_highv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal2\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlsl_lane_highv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (neg:V2HF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_expand "aarch64_fml<f16mac1>lq_laneq_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") + (match_operand:V8HF 2 "register_operand" "") + (match_operand:V8HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_lane_imm3" "")] + VFMLA16_LOW))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, false); + rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_lowv4sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; +}) + +(define_expand "aarch64_fml<f16mac1>lq_laneq_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") + (match_operand:V8HF 2 "register_operand" "") + (match_operand:V8HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_lane_imm3" "")] + VFMLA16_HIGH))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, true); + rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_highv4sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; +}) + +(define_insn "aarch64_simd_fmlalq_laneq_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlslq_laneq_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (neg:V4HF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlalq_laneq_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal2\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlslq_laneq_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (neg:V4HF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_expand "aarch64_fml<f16mac1>l_laneq_lowv2sf" + [(set (match_operand:V2SF 0 "register_operand" "") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") + (match_operand:V4HF 2 "register_operand" "") + (match_operand:V8HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_lane_imm3" "")] + VFMLA16_LOW))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, false); + rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_lowv2sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; + +}) + +(define_expand "aarch64_fml<f16mac1>l_laneq_highv2sf" + [(set (match_operand:V2SF 0 "register_operand" "") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") + (match_operand:V4HF 2 "register_operand" "") + (match_operand:V8HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_lane_imm3" "")] + VFMLA16_HIGH))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, true); + rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_highv2sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; + +}) + +(define_insn "aarch64_simd_fmlal_laneq_lowv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlsl_laneq_lowv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (neg:V2HF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlal_laneq_highv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal2\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlsl_laneq_highv2sf" + [(set (match_operand:V2SF 0 "register_operand" "=w") + (fma:V2SF + (float_extend:V2SF + (neg:V2HF + (vec_select:V2HF + (match_operand:V4HF 2 "register_operand" "w") + (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))) + (float_extend:V2SF + (vec_duplicate:V2HF + (vec_select:HF + (match_operand:V8HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")])))) + (match_operand:V2SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_expand "aarch64_fml<f16mac1>lq_lane_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") + (match_operand:V8HF 2 "register_operand" "") + (match_operand:V4HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_imm2" "")] + VFMLA16_LOW))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, false); + rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_lowv4sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; +}) + +(define_expand "aarch64_fml<f16mac1>lq_lane_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") + (match_operand:V8HF 2 "register_operand" "") + (match_operand:V4HF 3 "register_operand" "") + (match_operand:SI 4 "aarch64_imm2" "")] + VFMLA16_HIGH))] + "TARGET_F16FML" +{ + rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, true); + rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4])); + + emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_highv4sf (operands[0], + operands[1], + operands[2], + operands[3], + p1, lane)); + DONE; +}) + +(define_insn "aarch64_simd_fmlalq_lane_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlslq_lane_lowv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (neg:V4HF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlalq_lane_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlal2\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + +(define_insn "aarch64_simd_fmlslq_lane_highv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (fma:V4SF + (float_extend:V4SF + (neg:V4HF + (vec_select:V4HF + (match_operand:V8HF 2 "register_operand" "w") + (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))) + (float_extend:V4SF + (vec_duplicate:V4HF + (vec_select:HF + (match_operand:V4HF 3 "register_operand" "x") + (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")])))) + (match_operand:V4SF 1 "register_operand" "0")))] + "TARGET_F16FML" + "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]" + [(set_attr "type" "neon_fp_mul_s")] +) + ;; pmull (define_insn "aarch64_crypto_pmulldi" @@ -5751,7 +6839,7 @@ (unspec:TI [(match_operand:DI 1 "register_operand" "w") (match_operand:DI 2 "register_operand" "w")] UNSPEC_PMULL))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_AES" "pmull\\t%0.1q, %1.1d, %2.1d" [(set_attr "type" "crypto_pmull")] ) @@ -5761,7 +6849,7 @@ (unspec:TI [(match_operand:V2DI 1 "register_operand" "w") (match_operand:V2DI 2 "register_operand" "w")] UNSPEC_PMULL2))] - "TARGET_SIMD && TARGET_CRYPTO" + "TARGET_SIMD && TARGET_AES" "pmull2\\t%0.1q, %1.2d, %2.2d" [(set_attr "type" "crypto_pmull")] )