Mercurial > hg > CbC > CbC_gcc
diff gcc/config/arm/neon.md @ 131:84e7813d76e9
gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 07:37:49 +0900 |
parents | 04ced10e8804 |
children | 1830386684a0 |
line wrap: on
line diff
--- a/gcc/config/arm/neon.md Fri Oct 27 22:46:09 2017 +0900 +++ b/gcc/config/arm/neon.md Thu Oct 25 07:37:49 2018 +0900 @@ -1,5 +1,5 @@ ;; ARM NEON coprocessor Machine Description -;; Copyright (C) 2006-2017 Free Software Foundation, Inc. +;; Copyright (C) 2006-2018 Free Software Foundation, Inc. ;; Written by CodeSourcery. ;; ;; This file is part of GCC. @@ -25,9 +25,9 @@ (define_insn "*neon_mov<mode>" [(set (match_operand:VDX 0 "nonimmediate_operand" - "=w,Un,w, w, ?r,?w,?r,?r, ?Us") + "=w,Un,w, w, ?r,?w,?r, ?Us") (match_operand:VDX 1 "general_operand" - " w,w, Dn,Uni, w, r, r, Usi,r"))] + " w,w, Dn,Uni, w, r, Usi,r"))] "TARGET_NEON && (register_operand (operands[0], <MODE>mode) || register_operand (operands[1], <MODE>mode))" @@ -61,12 +61,12 @@ } } [(set_attr "type" "neon_move<q>,neon_store1_1reg,neon_move<q>,\ - neon_load1_1reg, neon_to_gp<q>,neon_from_gp<q>,mov_reg,\ + neon_load1_1reg, neon_to_gp<q>,neon_from_gp<q>,\ neon_load1_2reg, neon_store1_2reg") - (set_attr "length" "4,4,4,4,4,4,8,8,8") - (set_attr "arm_pool_range" "*,*,*,1020,*,*,*,1020,*") - (set_attr "thumb2_pool_range" "*,*,*,1018,*,*,*,1018,*") - (set_attr "neg_pool_range" "*,*,*,1004,*,*,*,1004,*")]) + (set_attr "length" "4,4,4,4,4,4,8,8") + (set_attr "arm_pool_range" "*,*,*,1020,*,*,1020,*") + (set_attr "thumb2_pool_range" "*,*,*,1018,*,*,1018,*") + (set_attr "neg_pool_range" "*,*,*,1004,*,*,1004,*")]) (define_insn "*neon_mov<mode>" [(set (match_operand:VQXMOV 0 "nonimmediate_operand" @@ -113,6 +113,13 @@ (set_attr "thumb2_pool_range" "*,*,*,1018,*,*,*,1018,*") (set_attr "neg_pool_range" "*,*,*,996,*,*,*,996,*")]) +/* We define these mov expanders to match the standard mov$a optab to prevent + the mid-end from trying to do a subreg for these modes which is the most + inefficient way to expand the move. Also big-endian subreg's aren't + allowed for a subset of modes, See TARGET_CAN_CHANGE_MODE_CLASS. + Without these RTL generation patterns the mid-end would attempt to take a + sub-reg and may ICE if it can't. */ + (define_expand "movti" [(set (match_operand:TI 0 "nonimmediate_operand" "") (match_operand:TI 1 "general_operand" ""))] @@ -137,33 +144,15 @@ } }) -(define_expand "movv4hf" - [(set (match_operand:V4HF 0 "s_register_operand") - (match_operand:V4HF 1 "s_register_operand"))] - "TARGET_NEON && TARGET_FP16" -{ - /* We need to use force_reg to avoid TARGET_CAN_CHANGE_MODE_CLASS - causing an ICE on big-endian because it cannot extract subregs in - this case. */ +(define_expand "mov<mode>" + [(set (match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand"))] + "TARGET_NEON" +{ if (can_create_pseudo_p ()) { if (!REG_P (operands[0])) - operands[1] = force_reg (V4HFmode, operands[1]); - } -}) - -(define_expand "movv8hf" - [(set (match_operand:V8HF 0 "") - (match_operand:V8HF 1 ""))] - "TARGET_NEON && TARGET_FP16" -{ - /* We need to use force_reg to avoid TARGET_CAN_CHANGE_MODE_CLASS - causing an ICE on big-endian because it cannot extract subregs in - this case. */ - if (can_create_pseudo_p ()) - { - if (!REG_P (operands[0])) - operands[1] = force_reg (V8HFmode, operands[1]); + operands[1] = force_reg (<MODE>mode, operands[1]); } }) @@ -1180,12 +1169,12 @@ ) (define_insn_and_split "ashldi3_neon" - [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?&r, ?w,w") - (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r, 0w,w") - (match_operand:SI 2 "general_operand" "rUm, i, r, i, i,rUm,i"))) - (clobber (match_scratch:SI 3 "= X, X,?&r, X, X, X,X")) - (clobber (match_scratch:SI 4 "= X, X,?&r, X, X, X,X")) - (clobber (match_scratch:DI 5 "=&w, X, X, X, X, &w,X")) + [(set (match_operand:DI 0 "s_register_operand" "= w, w, &r, r, &r, ?w,?w") + (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r, 0w, w") + (match_operand:SI 2 "general_operand" "rUm, i, r, i, i,rUm, i"))) + (clobber (match_scratch:SI 3 "= X, X, &r, X, X, X, X")) + (clobber (match_scratch:SI 4 "= X, X, &r, X, X, X, X")) + (clobber (match_scratch:DI 5 "=&w, X, X, X, X, &w, X")) (clobber (reg:CC_C CC_REGNUM))] "TARGET_NEON" "#" @@ -1221,12 +1210,8 @@ gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1]) || REGNO (operands[0]) == REGNO (operands[1])); - if (operands[2] == CONST1_RTX (SImode)) - /* This clobbers CC. */ - emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1])); - else - arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1], - operands[2], operands[3], operands[4]); + arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1], + operands[2], operands[3], operands[4]); } DONE; }" @@ -1280,7 +1265,7 @@ ;; ashrdi3_neon ;; lshrdi3_neon (define_insn_and_split "<shift>di3_neon" - [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?&r,?w,?w") + [(set (match_operand:DI 0 "s_register_operand" "= w, w, &r, r, &r,?w,?w") (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r,0w, w") (match_operand:SI 2 "reg_or_int_operand" " r, i, r, i, i, r, i"))) (clobber (match_scratch:SI 3 "=2r, X, &r, X, X,2r, X")) @@ -1325,13 +1310,9 @@ gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1]) || REGNO (operands[0]) == REGNO (operands[1])); - if (operands[2] == CONST1_RTX (SImode)) - /* This clobbers CC. */ - emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1])); - else - /* This clobbers CC (ASHIFTRT by register only). */ - arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1], - operands[2], operands[3], operands[4]); + /* This clobbers CC (ASHIFTRT by register only). */ + arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1], + operands[2], operands[3], operands[4]); } DONE; @@ -2298,6 +2279,406 @@ DONE; }) +;; The expand RTL structure here is not important. +;; We use the gen_* functions anyway. +;; We just need something to wrap the iterators around. + +(define_expand "neon_vfm<vfml_op>l_<vfml_half><mode>" + [(set (match_operand:VCVTF 0 "s_register_operand") + (unspec:VCVTF + [(match_operand:VCVTF 1 "s_register_operand") + (PLUSMINUS:<VFML> + (match_operand:<VFML> 2 "s_register_operand") + (match_operand:<VFML> 3 "s_register_operand"))] VFMLHALVES))] + "TARGET_FP16FML" +{ + rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>); + emit_insn (gen_vfm<vfml_op>l_<vfml_half><mode>_intrinsic (operands[0], + operands[1], + operands[2], + operands[3], + half, half)); + DONE; +}) + +(define_insn "vfmal_low<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" ""))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_low" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmsl_high<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" "")))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_high" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmal_high<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" ""))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_high" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmsl_low<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" "")))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_low" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>" + [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand") + (unspec:VCVTF + [(match_operand:VCVTF 1 "s_register_operand") + (PLUSMINUS:<VFML> + (match_operand:<VFML> 2 "s_register_operand") + (match_operand:<VFML> 3 "s_register_operand")) + (match_operand:SI 4 "const_int_operand")] VFMLHALVES))] + "TARGET_FP16FML" +{ + rtx lane = GEN_INT (NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[4]))); + rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>); + emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><mode>_intrinsic + (operands[0], operands[1], + operands[2], operands[3], + half, lane)); + DONE; +}) + +(define_insn "vfmal_lane_low<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" ""))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFML> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5])); + if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1) + { + operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode)); + return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]"; + } + else + { + operands[5] = GEN_INT (lane); + return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]"; + } + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>" + [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand") + (unspec:VCVTF + [(match_operand:VCVTF 1 "s_register_operand") + (PLUSMINUS:<VFML> + (match_operand:<VFML> 2 "s_register_operand") + (match_operand:<VFMLSEL2> 3 "s_register_operand")) + (match_operand:SI 4 "const_int_operand")] VFMLHALVES))] + "TARGET_FP16FML" +{ + rtx lane + = GEN_INT (NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[4]))); + rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>); + emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>_intrinsic + (operands[0], operands[1], operands[2], operands[3], + half, lane)); + DONE; +}) + +;; Used to implement the intrinsics: +;; float32x4_t vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane) +;; float32x2_t vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane) +;; Needs a bit of care to get the modes of the different sub-expressions right +;; due to 'a' and 'b' having different sizes and make sure we use the right +;; S or D subregister to select the appropriate lane from. + +(define_insn "vfmal_lane_low<vfmlsel2><mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" ""))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFMLSEL2> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5])); + int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode); + int new_lane = lane % elts_per_reg; + int regdiff = lane / elts_per_reg; + operands[5] = GEN_INT (new_lane); + /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes + because we want the print_operand code to print the appropriate + S or D register prefix. */ + operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff); + operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2])); + return "vfmal.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]"; + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +;; Used to implement the intrinsics: +;; float32x4_t vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane) +;; float32x2_t vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane) +;; Needs a bit of care to get the modes of the different sub-expressions right +;; due to 'a' and 'b' having different sizes and make sure we use the right +;; S or D subregister to select the appropriate lane from. + +(define_insn "vfmal_lane_high<vfmlsel2><mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" ""))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFMLSEL2> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5])); + int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode); + int new_lane = lane % elts_per_reg; + int regdiff = lane / elts_per_reg; + operands[5] = GEN_INT (new_lane); + /* We re-create operands[3] in the halved VFMLSEL mode + because we've calculated the correct half-width subreg to extract + the lane from and we want to print *that* subreg instead. */ + operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff); + return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]"; + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmal_lane_high<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" ""))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFML> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5])); + if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1) + { + operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode)); + return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]"; + } + else + { + operands[5] = GEN_INT (lane); + return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]"; + } + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmsl_lane_low<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" "")))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFML> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5])); + if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1) + { + operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode)); + return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]"; + } + else + { + operands[5] = GEN_INT (lane); + return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]"; + } + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +;; Used to implement the intrinsics: +;; float32x4_t vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane) +;; float32x2_t vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane) +;; Needs a bit of care to get the modes of the different sub-expressions right +;; due to 'a' and 'b' having different sizes and make sure we use the right +;; S or D subregister to select the appropriate lane from. + +(define_insn "vfmsl_lane_low<vfmlsel2><mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" "")))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFMLSEL2> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5])); + int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode); + int new_lane = lane % elts_per_reg; + int regdiff = lane / elts_per_reg; + operands[5] = GEN_INT (new_lane); + /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes + because we want the print_operand code to print the appropriate + S or D register prefix. */ + operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff); + operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2])); + return "vfmsl.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]"; + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +;; Used to implement the intrinsics: +;; float32x4_t vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane) +;; float32x2_t vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane) +;; Needs a bit of care to get the modes of the different sub-expressions right +;; due to 'a' and 'b' having different sizes and make sure we use the right +;; S or D subregister to select the appropriate lane from. + +(define_insn "vfmsl_lane_high<vfmlsel2><mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" "")))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFMLSEL2> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5])); + int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode); + int new_lane = lane % elts_per_reg; + int regdiff = lane / elts_per_reg; + operands[5] = GEN_INT (new_lane); + /* We re-create operands[3] in the halved VFMLSEL mode + because we've calculated the correct half-width subreg to extract + the lane from and we want to print *that* subreg instead. */ + operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff); + return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]"; + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmsl_lane_high<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" "")))) + (float_extend:VCVTF + (vec_duplicate:<VFMLSEL> + (vec_select:HF + (match_operand:<VFML> 3 "s_register_operand" "x") + (parallel [(match_operand:SI 5 "const_int_operand" "n")])))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + { + int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5])); + if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1) + { + operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode)); + return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]"; + } + else + { + operands[5] = GEN_INT (lane); + return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]"; + } + } + [(set_attr "type" "neon_fp_mla_s<q>")] +) + ; Used for intrinsics when flag_unsafe_math_optimizations is false. (define_insn "neon_vmla<mode>_unspec" @@ -3122,15 +3503,10 @@ "{ rtx v_bitmask_cast; rtx v_bitmask = gen_reg_rtx (<VCVTF:V_cmp_result>mode); - int i, n_elt = GET_MODE_NUNITS (<MODE>mode); - rtvec v = rtvec_alloc (n_elt); - - /* Create bitmask for vector select. */ - for (i = 0; i < n_elt; ++i) - RTVEC_ELT (v, i) = GEN_INT (0x80000000); + rtx c = GEN_INT (0x80000000); emit_move_insn (v_bitmask, - gen_rtx_CONST_VECTOR (<VCVTF:V_cmp_result>mode, v)); + gen_const_vec_duplicate (<VCVTF:V_cmp_result>mode, c)); emit_move_insn (operands[0], operands[2]); v_bitmask_cast = simplify_gen_subreg (<MODE>mode, v_bitmask, <VCVTF:V_cmp_result>mode, 0); @@ -6319,28 +6695,22 @@ }) (define_insn "neon_vabd<mode>_2" - [(set (match_operand:VDQ 0 "s_register_operand" "=w") - (abs:VDQ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") - (match_operand:VDQ 2 "s_register_operand" "w"))))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + [(set (match_operand:VF 0 "s_register_operand" "=w") + (abs:VF (minus:VF (match_operand:VF 1 "s_register_operand" "w") + (match_operand:VF 2 "s_register_operand" "w"))))] + "TARGET_NEON && flag_unsafe_math_optimizations" "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set (attr "type") - (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0)) - (const_string "neon_fp_abd_s<q>") - (const_string "neon_abd<q>")))] + [(set_attr "type" "neon_fp_abd_s<q>")] ) (define_insn "neon_vabd<mode>_3" - [(set (match_operand:VDQ 0 "s_register_operand" "=w") - (abs:VDQ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w") - (match_operand:VDQ 2 "s_register_operand" "w")] - UNSPEC_VSUB)))] - "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" + [(set (match_operand:VF 0 "s_register_operand" "=w") + (abs:VF (unspec:VF [(match_operand:VF 1 "s_register_operand" "w") + (match_operand:VF 2 "s_register_operand" "w")] + UNSPEC_VSUB)))] + "TARGET_NEON && flag_unsafe_math_optimizations" "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set (attr "type") - (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0)) - (const_string "neon_fp_abd_s<q>") - (const_string "neon_abd<q>")))] + [(set_attr "type" "neon_fp_abd_s<q>")] ) ;; Copy from core-to-neon regs, then extend, not vice-versa