comparison gcc/config/arm/neon.md @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
left column: 111:04ced10e8804    right column: 131:84e7813d76e9
1 ;; ARM NEON coprocessor Machine Description 1 ;; ARM NEON coprocessor Machine Description
2 ;; Copyright (C) 2006-2017 Free Software Foundation, Inc. 2 ;; Copyright (C) 2006-2018 Free Software Foundation, Inc.
3 ;; Written by CodeSourcery. 3 ;; Written by CodeSourcery.
4 ;; 4 ;;
5 ;; This file is part of GCC. 5 ;; This file is part of GCC.
6 ;; 6 ;;
7 ;; GCC is free software; you can redistribute it and/or modify it 7 ;; GCC is free software; you can redistribute it and/or modify it
23 ;; type attribute definitions. 23 ;; type attribute definitions.
24 (define_attr "vqh_mnem" "vadd,vmin,vmax" (const_string "vadd")) 24 (define_attr "vqh_mnem" "vadd,vmin,vmax" (const_string "vadd"))
25 25
26 (define_insn "*neon_mov<mode>" 26 (define_insn "*neon_mov<mode>"
27 [(set (match_operand:VDX 0 "nonimmediate_operand" 27 [(set (match_operand:VDX 0 "nonimmediate_operand"
28 "=w,Un,w, w, ?r,?w,?r,?r, ?Us") 28 "=w,Un,w, w, ?r,?w,?r, ?Us")
29 (match_operand:VDX 1 "general_operand" 29 (match_operand:VDX 1 "general_operand"
30 " w,w, Dn,Uni, w, r, r, Usi,r"))] 30 " w,w, Dn,Uni, w, r, Usi,r"))]
31 "TARGET_NEON 31 "TARGET_NEON
32 && (register_operand (operands[0], <MODE>mode) 32 && (register_operand (operands[0], <MODE>mode)
33 || register_operand (operands[1], <MODE>mode))" 33 || register_operand (operands[1], <MODE>mode))"
34 { 34 {
35 if (which_alternative == 2) 35 if (which_alternative == 2)
59 case 5: return "vmov\t%P0, %Q1, %R1 @ <mode>"; 59 case 5: return "vmov\t%P0, %Q1, %R1 @ <mode>";
60 default: return output_move_double (operands, true, NULL); 60 default: return output_move_double (operands, true, NULL);
61 } 61 }
62 } 62 }
63 [(set_attr "type" "neon_move<q>,neon_store1_1reg,neon_move<q>,\ 63 [(set_attr "type" "neon_move<q>,neon_store1_1reg,neon_move<q>,\
64 neon_load1_1reg, neon_to_gp<q>,neon_from_gp<q>,mov_reg,\ 64 neon_load1_1reg, neon_to_gp<q>,neon_from_gp<q>,\
65 neon_load1_2reg, neon_store1_2reg") 65 neon_load1_2reg, neon_store1_2reg")
66 (set_attr "length" "4,4,4,4,4,4,8,8,8") 66 (set_attr "length" "4,4,4,4,4,4,8,8")
67 (set_attr "arm_pool_range" "*,*,*,1020,*,*,*,1020,*") 67 (set_attr "arm_pool_range" "*,*,*,1020,*,*,1020,*")
68 (set_attr "thumb2_pool_range" "*,*,*,1018,*,*,*,1018,*") 68 (set_attr "thumb2_pool_range" "*,*,*,1018,*,*,1018,*")
69 (set_attr "neg_pool_range" "*,*,*,1004,*,*,*,1004,*")]) 69 (set_attr "neg_pool_range" "*,*,*,1004,*,*,1004,*")])
70 70
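For orientation, a minimal C sketch (assuming arm_neon.h and NEON enabled) of the 64-bit D-register values this VDX move pattern handles; a plain register-to-register copy of any of these types goes through *neon_mov<mode>:

    #include <arm_neon.h>

    /* Simple D-register moves matched by *neon_mov<mode> above.  */
    int64x1_t copy_di   (int64x1_t x) { return x; }
    int32x2_t copy_v2si (int32x2_t x) { return x; }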
71 (define_insn "*neon_mov<mode>" 71 (define_insn "*neon_mov<mode>"
72 [(set (match_operand:VQXMOV 0 "nonimmediate_operand" 72 [(set (match_operand:VQXMOV 0 "nonimmediate_operand"
73 "=w,Un,w, w, ?r,?w,?r,?r, ?Us") 73 "=w,Un,w, w, ?r,?w,?r,?r, ?Us")
74 (match_operand:VQXMOV 1 "general_operand" 74 (match_operand:VQXMOV 1 "general_operand"
111 (set_attr "length" "4,8,4,8,8,8,16,8,16") 111 (set_attr "length" "4,8,4,8,8,8,16,8,16")
112 (set_attr "arm_pool_range" "*,*,*,1020,*,*,*,1020,*") 112 (set_attr "arm_pool_range" "*,*,*,1020,*,*,*,1020,*")
113 (set_attr "thumb2_pool_range" "*,*,*,1018,*,*,*,1018,*") 113 (set_attr "thumb2_pool_range" "*,*,*,1018,*,*,*,1018,*")
114 (set_attr "neg_pool_range" "*,*,*,996,*,*,*,996,*")]) 114 (set_attr "neg_pool_range" "*,*,*,996,*,*,*,996,*")])
115 115
116 /* We define these mov expanders to match the standard mov$a optab to prevent
117 the mid-end from trying to do a subreg for these modes, which is the most
118 inefficient way to expand the move. Also, big-endian subregs aren't
119 allowed for a subset of modes; see TARGET_CAN_CHANGE_MODE_CLASS.
120 Without these RTL generation patterns, the mid-end would attempt to take a
121 subreg and may ICE if it can't. */
122
116 (define_expand "movti" 123 (define_expand "movti"
117 [(set (match_operand:TI 0 "nonimmediate_operand" "") 124 [(set (match_operand:TI 0 "nonimmediate_operand" "")
118 (match_operand:TI 1 "general_operand" ""))] 125 (match_operand:TI 1 "general_operand" ""))]
119 "TARGET_NEON" 126 "TARGET_NEON"
120 { 127 {
135 if (!REG_P (operands[0])) 142 if (!REG_P (operands[0]))
136 operands[1] = force_reg (<MODE>mode, operands[1]); 143 operands[1] = force_reg (<MODE>mode, operands[1]);
137 } 144 }
138 }) 145 })
139 146
140 (define_expand "movv4hf" 147 (define_expand "mov<mode>"
141 [(set (match_operand:V4HF 0 "s_register_operand") 148 [(set (match_operand:VH 0 "s_register_operand")
142 (match_operand:V4HF 1 "s_register_operand"))] 149 (match_operand:VH 1 "s_register_operand"))]
143 "TARGET_NEON && TARGET_FP16" 150 "TARGET_NEON"
144 { 151 {
145 /* We need to use force_reg to avoid TARGET_CAN_CHANGE_MODE_CLASS
146 causing an ICE on big-endian because it cannot extract subregs in
147 this case. */
148 if (can_create_pseudo_p ()) 152 if (can_create_pseudo_p ())
149 { 153 {
150 if (!REG_P (operands[0])) 154 if (!REG_P (operands[0]))
151 operands[1] = force_reg (V4HFmode, operands[1]); 155 operands[1] = force_reg (<MODE>mode, operands[1]);
152 }
153 })
154
155 (define_expand "movv8hf"
156 [(set (match_operand:V8HF 0 "")
157 (match_operand:V8HF 1 ""))]
158 "TARGET_NEON && TARGET_FP16"
159 {
160 /* We need to use force_reg to avoid TARGET_CAN_CHANGE_MODE_CLASS
161 causing an ICE on big-endian because it cannot extract subregs in
162 this case. */
163 if (can_create_pseudo_p ())
164 {
165 if (!REG_P (operands[0]))
166 operands[1] = force_reg (V8HFmode, operands[1]);
167 } 156 }
168 }) 157 })
169 158
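A minimal C sketch of code that exercises the unified mov<mode> expander for the VH modes (V4HF and V8HF), assuming arm_neon.h and a NEON FP16 target such as -mfpu=neon-fp16:

    #include <arm_neon.h>

    /* Plain vector copies; force_reg in the expander keeps the
       mid-end from taking a subreg on big-endian.  */
    float16x4_t copy_v4hf (float16x4_t x) { return x; }
    float16x8_t copy_v8hf (float16x8_t x) { return x; }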
170 (define_insn "*neon_mov<mode>" 159 (define_insn "*neon_mov<mode>"
171 [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Ut,w") 160 [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Ut,w")
1178 vshl.u64\t%P0, %P1, %P2" 1167 vshl.u64\t%P0, %P1, %P2"
1179 [(set_attr "type" "neon_shift_imm, neon_shift_reg")] 1168 [(set_attr "type" "neon_shift_imm, neon_shift_reg")]
1180 ) 1169 )
1181 1170
1182 (define_insn_and_split "ashldi3_neon" 1171 (define_insn_and_split "ashldi3_neon"
1183 [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?&r, ?w,w") 1172 [(set (match_operand:DI 0 "s_register_operand" "= w, w, &r, r, &r, ?w,?w")
1184 (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r, 0w,w") 1173 (ashift:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r, 0w, w")
1185 (match_operand:SI 2 "general_operand" "rUm, i, r, i, i,rUm,i"))) 1174 (match_operand:SI 2 "general_operand" "rUm, i, r, i, i,rUm, i")))
1186 (clobber (match_scratch:SI 3 "= X, X,?&r, X, X, X,X")) 1175 (clobber (match_scratch:SI 3 "= X, X, &r, X, X, X, X"))
1187 (clobber (match_scratch:SI 4 "= X, X,?&r, X, X, X,X")) 1176 (clobber (match_scratch:SI 4 "= X, X, &r, X, X, X, X"))
1188 (clobber (match_scratch:DI 5 "=&w, X, X, X, X, &w,X")) 1177 (clobber (match_scratch:DI 5 "=&w, X, X, X, X, &w, X"))
1189 (clobber (reg:CC_C CC_REGNUM))] 1178 (clobber (reg:CC_C CC_REGNUM))]
1190 "TARGET_NEON" 1179 "TARGET_NEON"
1191 "#" 1180 "#"
1192 "TARGET_NEON && reload_completed" 1181 "TARGET_NEON && reload_completed"
1193 [(const_int 0)] 1182 [(const_int 0)]
1219 { 1208 {
1220 /* The shift expanders support either full overlap or no overlap. */ 1209 /* The shift expanders support either full overlap or no overlap. */
1221 gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1]) 1210 gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
1222 || REGNO (operands[0]) == REGNO (operands[1])); 1211 || REGNO (operands[0]) == REGNO (operands[1]));
1223 1212
1224 if (operands[2] == CONST1_RTX (SImode)) 1213 arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
1225 /* This clobbers CC. */ 1214 operands[2], operands[3], operands[4]);
1226 emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
1227 else
1228 arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
1229 operands[2], operands[3], operands[4]);
1230 } 1215 }
1231 DONE; 1216 DONE;
1232 }" 1217 }"
1233 [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits") 1218 [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
1234 (set_attr "opt" "*,*,speed,speed,speed,*,*") 1219 (set_attr "opt" "*,*,speed,speed,speed,*,*")
1278 ) 1263 )
1279 1264
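As background for the core-register alternatives, a rough C sketch of the two-word sequence arm_emit_coreregs_64bit_shift expands to for a variable 64-bit left shift (illustrative only; the real expansion is RTL and does its own shift-count handling):

    #include <stdint.h>

    uint64_t shl64 (uint64_t x, unsigned n)  /* n assumed in 0..63 */
    {
      uint32_t lo = (uint32_t) x, hi = (uint32_t) (x >> 32);
      uint32_t out_lo, out_hi;
      if (n < 32)
        {
          out_hi = (hi << n) | (n ? lo >> (32 - n) : 0);
          out_lo = lo << n;
        }
      else
        {
          out_hi = lo << (n - 32);
          out_lo = 0;
        }
      return ((uint64_t) out_hi << 32) | out_lo;
    }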
1280 ;; ashrdi3_neon 1265 ;; ashrdi3_neon
1281 ;; lshrdi3_neon 1266 ;; lshrdi3_neon
1282 (define_insn_and_split "<shift>di3_neon" 1267 (define_insn_and_split "<shift>di3_neon"
1283 [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?&r,?w,?w") 1268 [(set (match_operand:DI 0 "s_register_operand" "= w, w, &r, r, &r,?w,?w")
1284 (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r,0w, w") 1269 (RSHIFTS:DI (match_operand:DI 1 "s_register_operand" " 0w, w, 0r, 0, r,0w, w")
1285 (match_operand:SI 2 "reg_or_int_operand" " r, i, r, i, i, r, i"))) 1270 (match_operand:SI 2 "reg_or_int_operand" " r, i, r, i, i, r, i")))
1286 (clobber (match_scratch:SI 3 "=2r, X, &r, X, X,2r, X")) 1271 (clobber (match_scratch:SI 3 "=2r, X, &r, X, X,2r, X"))
1287 (clobber (match_scratch:SI 4 "= X, X, &r, X, X, X, X")) 1272 (clobber (match_scratch:SI 4 "= X, X, &r, X, X, X, X"))
1288 (clobber (match_scratch:DI 5 "=&w, X, X, X, X,&w, X")) 1273 (clobber (match_scratch:DI 5 "=&w, X, X, X, X,&w, X"))
1323 { 1308 {
1324 /* The shift expanders support either full overlap or no overlap. */ 1309 /* The shift expanders support either full overlap or no overlap. */
1325 gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1]) 1310 gcc_assert (!reg_overlap_mentioned_p (operands[0], operands[1])
1326 || REGNO (operands[0]) == REGNO (operands[1])); 1311 || REGNO (operands[0]) == REGNO (operands[1]));
1327 1312
1328 if (operands[2] == CONST1_RTX (SImode)) 1313 /* This clobbers CC (ASHIFTRT by register only). */
1329 /* This clobbers CC. */ 1314 arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
1330 emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1])); 1315 operands[2], operands[3], operands[4]);
1331 else
1332 /* This clobbers CC (ASHIFTRT by register only). */
1333 arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
1334 operands[2], operands[3], operands[4]);
1335 } 1316 }
1336 1317
1337 DONE; 1318 DONE;
1338 }" 1319 }"
1339 [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits") 1320 [(set_attr "arch" "neon_for_64bits,neon_for_64bits,*,*,*,avoid_neon_for_64bits,avoid_neon_for_64bits")
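The right-shift counterpart, again only a sketch of the core-register path (LSHIFTRT shown; ASHIFTRT differs in sign-extending the high word, and its by-register form clobbers CC as noted above):

    #include <stdint.h>

    uint64_t lshr64 (uint64_t x, unsigned n)  /* n assumed in 0..63 */
    {
      uint32_t lo = (uint32_t) x, hi = (uint32_t) (x >> 32);
      uint32_t out_lo, out_hi;
      if (n < 32)
        {
          out_lo = (lo >> n) | (n ? hi << (32 - n) : 0);
          out_hi = hi >> n;
        }
      else
        {
          out_lo = hi >> (n - 32);
          out_hi = 0;
        }
      return ((uint64_t) out_hi << 32) | out_lo;
    }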
2296 emit_insn (gen_fmsub<mode>4_intrinsic (operands[0], operands[2], operands[3], 2277 emit_insn (gen_fmsub<mode>4_intrinsic (operands[0], operands[2], operands[3],
2297 operands[1])); 2278 operands[1]));
2298 DONE; 2279 DONE;
2299 }) 2280 })
2300 2281
2282 ;; The expand RTL structure here is not important.
2283 ;; We use the gen_* functions anyway.
2284 ;; We just need something to wrap the iterators around.
2285
2286 (define_expand "neon_vfm<vfml_op>l_<vfml_half><mode>"
2287 [(set (match_operand:VCVTF 0 "s_register_operand")
2288 (unspec:VCVTF
2289 [(match_operand:VCVTF 1 "s_register_operand")
2290 (PLUSMINUS:<VFML>
2291 (match_operand:<VFML> 2 "s_register_operand")
2292 (match_operand:<VFML> 3 "s_register_operand"))] VFMLHALVES))]
2293 "TARGET_FP16FML"
2294 {
2295 rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
2296 emit_insn (gen_vfm<vfml_op>l_<vfml_half><mode>_intrinsic (operands[0],
2297 operands[1],
2298 operands[2],
2299 operands[3],
2300 half, half));
2301 DONE;
2302 })
2303
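A hedged usage sketch of the intrinsics these expanders back, using the early-ACLE _u32 spellings that appear in the comments further down this file (the exact non-lane name is assumed by analogy; requires a target with +fp16fml):

    #include <arm_neon.h>

    /* Low-half widening multiply-accumulate: each f32 lane of r
       accumulates the product of the widened low-half f16 lanes.  */
    float32x2_t mla_low (float32x2_t r, float16x4_t a, float16x4_t b)
    {
      return vfmlal_low_u32 (r, a, b);
    }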
2304 (define_insn "vfmal_low<mode>_intrinsic"
2305 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2306 (fma:VCVTF
2307 (float_extend:VCVTF
2308 (vec_select:<VFMLSEL>
2309 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2310 (match_operand:<VFML> 4 "vect_par_constant_low" "")))
2311 (float_extend:VCVTF
2312 (vec_select:<VFMLSEL>
2313 (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
2314 (match_operand:<VFML> 5 "vect_par_constant_low" "")))
2315 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2316 "TARGET_FP16FML"
2317 "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
2318 [(set_attr "type" "neon_fp_mla_s<q>")]
2319 )
2320
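Lane-wise, the fma/float_extend/vec_select RTL above corresponds to this scalar reference (Q-register case, V8HF inputs into a V4SF accumulator; written unfused here for clarity, assuming _Float16 support):

    void vfmlal_low_ref (float r[4], const _Float16 a[8],
                         const _Float16 b[8])
    {
      for (int i = 0; i < 4; i++)       /* low half: lanes 0..3 */
        r[i] += (float) a[i] * (float) b[i];
    }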
2321 (define_insn "vfmsl_high<mode>_intrinsic"
2322 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2323 (fma:VCVTF
2324 (float_extend:VCVTF
2325 (neg:<VFMLSEL>
2326 (vec_select:<VFMLSEL>
2327 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2328 (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
2329 (float_extend:VCVTF
2330 (vec_select:<VFMLSEL>
2331 (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
2332 (match_operand:<VFML> 5 "vect_par_constant_high" "")))
2333 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2334 "TARGET_FP16FML"
2335 "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
2336 [(set_attr "type" "neon_fp_mla_s<q>")]
2337 )
2338
2339 (define_insn "vfmal_high<mode>_intrinsic"
2340 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2341 (fma:VCVTF
2342 (float_extend:VCVTF
2343 (vec_select:<VFMLSEL>
2344 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2345 (match_operand:<VFML> 4 "vect_par_constant_high" "")))
2346 (float_extend:VCVTF
2347 (vec_select:<VFMLSEL>
2348 (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
2349 (match_operand:<VFML> 5 "vect_par_constant_high" "")))
2350 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2351 "TARGET_FP16FML"
2352 "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
2353 [(set_attr "type" "neon_fp_mla_s<q>")]
2354 )
2355
2356 (define_insn "vfmsl_low<mode>_intrinsic"
2357 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2358 (fma:VCVTF
2359 (float_extend:VCVTF
2360 (neg:<VFMLSEL>
2361 (vec_select:<VFMLSEL>
2362 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2363 (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
2364 (float_extend:VCVTF
2365 (vec_select:<VFMLSEL>
2366 (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
2367 (match_operand:<VFML> 5 "vect_par_constant_low" "")))
2368 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2369 "TARGET_FP16FML"
2370 "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
2371 [(set_attr "type" "neon_fp_mla_s<q>")]
2372 )
2373
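The vfmsl forms differ only in the (neg ...) wrapped around the first multiplicand; the matching scalar sketch is:

    void vfmlsl_low_ref (float r[4], const _Float16 a[8],
                         const _Float16 b[8])
    {
      for (int i = 0; i < 4; i++)
        r[i] += (float) -a[i] * (float) b[i];  /* fused in hardware */
    }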
2374 (define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>"
2375 [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
2376 (unspec:VCVTF
2377 [(match_operand:VCVTF 1 "s_register_operand")
2378 (PLUSMINUS:<VFML>
2379 (match_operand:<VFML> 2 "s_register_operand")
2380 (match_operand:<VFML> 3 "s_register_operand"))
2381 (match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
2382 "TARGET_FP16FML"
2383 {
2384 rtx lane = GEN_INT (NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[4])));
2385 rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
2386 emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><mode>_intrinsic
2387 (operands[0], operands[1],
2388 operands[2], operands[3],
2389 half, lane));
2390 DONE;
2391 })
2392
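NEON_ENDIAN_LANE_N remaps the user-visible lane number on big-endian targets, where lanes are numbered from the other end of the register. A self-contained sketch of the mapping (see the macro in the ARM backend for the authoritative definition):

    /* Illustrative only; 'big_endian' stands in for BYTES_BIG_ENDIAN.  */
    static inline int endian_lane_n (int nunits, int n, int big_endian)
    {
      return big_endian ? nunits - 1 - n : n;
    }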
2393 (define_insn "vfmal_lane_low<mode>_intrinsic"
2394 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2395 (fma:VCVTF
2396 (float_extend:VCVTF
2397 (vec_select:<VFMLSEL>
2398 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2399 (match_operand:<VFML> 4 "vect_par_constant_low" "")))
2400 (float_extend:VCVTF
2401 (vec_duplicate:<VFMLSEL>
2402 (vec_select:HF
2403 (match_operand:<VFML> 3 "s_register_operand" "x")
2404 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2405 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2406 "TARGET_FP16FML"
2407 {
2408 int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
2409 if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
2410 {
2411 operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
2412 return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
2413 }
2414 else
2415 {
2416 operands[5] = GEN_INT (lane);
2417 return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
2418 }
2419 }
2420 [(set_attr "type" "neon_fp_mla_s<q>")]
2421 )
2422
2423 (define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>"
2424 [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
2425 (unspec:VCVTF
2426 [(match_operand:VCVTF 1 "s_register_operand")
2427 (PLUSMINUS:<VFML>
2428 (match_operand:<VFML> 2 "s_register_operand")
2429 (match_operand:<VFMLSEL2> 3 "s_register_operand"))
2430 (match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
2431 "TARGET_FP16FML"
2432 {
2433 rtx lane
2434 = GEN_INT (NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[4])));
2435 rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
2436 emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>_intrinsic
2437 (operands[0], operands[1], operands[2], operands[3],
2438 half, lane));
2439 DONE;
2440 })
2441
2442 ;; Used to implement the intrinsics:
2443 ;; float32x4_t vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
2444 ;; float32x2_t vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
2445 ;; This needs a bit of care to get the modes of the different sub-expressions
2446 ;; right, since 'a' and 'b' have different sizes, and to make sure we use the
2447 ;; right S or D subregister to select the appropriate lane from.
2448
2449 (define_insn "vfmal_lane_low<vfmlsel2><mode>_intrinsic"
2450 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2451 (fma:VCVTF
2452 (float_extend:VCVTF
2453 (vec_select:<VFMLSEL>
2454 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2455 (match_operand:<VFML> 4 "vect_par_constant_low" "")))
2456 (float_extend:VCVTF
2457 (vec_duplicate:<VFMLSEL>
2458 (vec_select:HF
2459 (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
2460 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2461 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2462 "TARGET_FP16FML"
2463 {
2464 int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
2465 int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
2466 int new_lane = lane % elts_per_reg;
2467 int regdiff = lane / elts_per_reg;
2468 operands[5] = GEN_INT (new_lane);
2469 /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
2470 because we want the print_operand code to print the appropriate
2471 S or D register prefix. */
2472 operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
2473 operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
2474 return "vfmal.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
2475 }
2476 [(set_attr "type" "neon_fp_mla_s<q>")]
2477 )
2478
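A worked sketch of the subregister arithmetic above for the laneq case (a float16x8_t operand 3, with 4 HF elements per D register):

    #include <stdio.h>

    int main (void)
    {
      const int elts_per_reg = 4;            /* HF lanes per D register */
      for (int lane = 0; lane < 8; lane++)   /* Q-register lane numbers */
        printf ("lane %d -> D register +%d, lane %d\n",
                lane, lane / elts_per_reg, lane % elts_per_reg);
      return 0;
    }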
2479 ;; Used to implement the intrinsics:
2480 ;; float32x4_t vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
2481 ;; float32x2_t vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
2482 ;; This needs a bit of care to get the modes of the different sub-expressions
2483 ;; right, since 'a' and 'b' have different sizes, and to make sure we use the
2484 ;; right S or D subregister to select the appropriate lane from.
2485
2486 (define_insn "vfmal_lane_high<vfmlsel2><mode>_intrinsic"
2487 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2488 (fma:VCVTF
2489 (float_extend:VCVTF
2490 (vec_select:<VFMLSEL>
2491 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2492 (match_operand:<VFML> 4 "vect_par_constant_high" "")))
2493 (float_extend:VCVTF
2494 (vec_duplicate:<VFMLSEL>
2495 (vec_select:HF
2496 (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
2497 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2498 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2499 "TARGET_FP16FML"
2500 {
2501 int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
2502 int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
2503 int new_lane = lane % elts_per_reg;
2504 int regdiff = lane / elts_per_reg;
2505 operands[5] = GEN_INT (new_lane);
2506 /* We re-create operands[3] in the halved VFMLSEL mode
2507 because we've calculated the correct half-width subreg to extract
2508 the lane from and we want to print *that* subreg instead. */
2509 operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
2510 return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
2511 }
2512 [(set_attr "type" "neon_fp_mla_s<q>")]
2513 )
2514
2515 (define_insn "vfmal_lane_high<mode>_intrinsic"
2516 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2517 (fma:VCVTF
2518 (float_extend:VCVTF
2519 (vec_select:<VFMLSEL>
2520 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2521 (match_operand:<VFML> 4 "vect_par_constant_high" "")))
2522 (float_extend:VCVTF
2523 (vec_duplicate:<VFMLSEL>
2524 (vec_select:HF
2525 (match_operand:<VFML> 3 "s_register_operand" "x")
2526 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2527 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2528 "TARGET_FP16FML"
2529 {
2530 int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
2531 if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
2532 {
2533 operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
2534 return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
2535 }
2536 else
2537 {
2538 operands[5] = GEN_INT (lane);
2539 return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
2540 }
2541 }
2542 [(set_attr "type" "neon_fp_mla_s<q>")]
2543 )
2544
2545 (define_insn "vfmsl_lane_low<mode>_intrinsic"
2546 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2547 (fma:VCVTF
2548 (float_extend:VCVTF
2549 (neg:<VFMLSEL>
2550 (vec_select:<VFMLSEL>
2551 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2552 (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
2553 (float_extend:VCVTF
2554 (vec_duplicate:<VFMLSEL>
2555 (vec_select:HF
2556 (match_operand:<VFML> 3 "s_register_operand" "x")
2557 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2558 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2559 "TARGET_FP16FML"
2560 {
2561 int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
2562 if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
2563 {
2564 operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
2565 return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
2566 }
2567 else
2568 {
2569 operands[5] = GEN_INT (lane);
2570 return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
2571 }
2572 }
2573 [(set_attr "type" "neon_fp_mla_s<q>")]
2574 )
2575
2576 ;; Used to implement the intrinsics:
2577 ;; float32x4_t vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
2578 ;; float32x2_t vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
2579 ;; This needs a bit of care to get the modes of the different sub-expressions
2580 ;; right, since 'a' and 'b' have different sizes, and to make sure we use the
2581 ;; right S or D subregister to select the appropriate lane from.
2582
2583 (define_insn "vfmsl_lane_low<vfmlsel2><mode>_intrinsic"
2584 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2585 (fma:VCVTF
2586 (float_extend:VCVTF
2587 (neg:<VFMLSEL>
2588 (vec_select:<VFMLSEL>
2589 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2590 (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
2591 (float_extend:VCVTF
2592 (vec_duplicate:<VFMLSEL>
2593 (vec_select:HF
2594 (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
2595 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2596 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2597 "TARGET_FP16FML"
2598 {
2599 int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
2600 int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
2601 int new_lane = lane % elts_per_reg;
2602 int regdiff = lane / elts_per_reg;
2603 operands[5] = GEN_INT (new_lane);
2604 /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
2605 because we want the print_operand code to print the appropriate
2606 S or D register prefix. */
2607 operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
2608 operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
2609 return "vfmsl.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
2610 }
2611 [(set_attr "type" "neon_fp_mla_s<q>")]
2612 )
2613
2614 ;; Used to implement the intrinsics:
2615 ;; float32x4_t vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
2616 ;; float32x2_t vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
2617 ;; This needs a bit of care to get the modes of the different sub-expressions
2618 ;; right, since 'a' and 'b' have different sizes, and to make sure we use the
2619 ;; right S or D subregister to select the appropriate lane from.
2620
2621 (define_insn "vfmsl_lane_high<vfmlsel2><mode>_intrinsic"
2622 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2623 (fma:VCVTF
2624 (float_extend:VCVTF
2625 (neg:<VFMLSEL>
2626 (vec_select:<VFMLSEL>
2627 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2628 (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
2629 (float_extend:VCVTF
2630 (vec_duplicate:<VFMLSEL>
2631 (vec_select:HF
2632 (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
2633 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2634 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2635 "TARGET_FP16FML"
2636 {
2637 int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
2638 int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
2639 int new_lane = lane % elts_per_reg;
2640 int regdiff = lane / elts_per_reg;
2641 operands[5] = GEN_INT (new_lane);
2642 /* We re-create operands[3] in the halved VFMLSEL mode
2643 because we've calculated the correct half-width subreg to extract
2644 the lane from and we want to print *that* subreg instead. */
2645 operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
2646 return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
2647 }
2648 [(set_attr "type" "neon_fp_mla_s<q>")]
2649 )
2650
2651 (define_insn "vfmsl_lane_high<mode>_intrinsic"
2652 [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
2653 (fma:VCVTF
2654 (float_extend:VCVTF
2655 (neg:<VFMLSEL>
2656 (vec_select:<VFMLSEL>
2657 (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
2658 (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
2659 (float_extend:VCVTF
2660 (vec_duplicate:<VFMLSEL>
2661 (vec_select:HF
2662 (match_operand:<VFML> 3 "s_register_operand" "x")
2663 (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
2664 (match_operand:VCVTF 1 "s_register_operand" "0")))]
2665 "TARGET_FP16FML"
2666 {
2667 int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
2668 if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
2669 {
2670 operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
2671 return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
2672 }
2673 else
2674 {
2675 operands[5] = GEN_INT (lane);
2676 return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
2677 }
2678 }
2679 [(set_attr "type" "neon_fp_mla_s<q>")]
2680 )
2681
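A usage sketch for one of the lane intrinsics listed in the comments above, with the signature taken from those comments (the lane must be a compile-time constant; requires +fp16fml):

    #include <arm_neon.h>

    float32x4_t mla_lane_low (float32x4_t r, float16x8_t a, float16x4_t b)
    {
      return vfmlalq_lane_low_u32 (r, a, b, 2);
    }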
2301 ; Used for intrinsics when flag_unsafe_math_optimizations is false. 2682 ; Used for intrinsics when flag_unsafe_math_optimizations is false.
2302 2683
2303 (define_insn "neon_vmla<mode>_unspec" 2684 (define_insn "neon_vmla<mode>_unspec"
2304 [(set (match_operand:VDQW 0 "s_register_operand" "=w") 2685 [(set (match_operand:VDQW 0 "s_register_operand" "=w")
2305 (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0") 2686 (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
3120 (match_operand:VCVTF 2 "register_operand")] 3501 (match_operand:VCVTF 2 "register_operand")]
3121 "TARGET_NEON" 3502 "TARGET_NEON"
3122 "{ 3503 "{
3123 rtx v_bitmask_cast; 3504 rtx v_bitmask_cast;
3124 rtx v_bitmask = gen_reg_rtx (<VCVTF:V_cmp_result>mode); 3505 rtx v_bitmask = gen_reg_rtx (<VCVTF:V_cmp_result>mode);
3125 int i, n_elt = GET_MODE_NUNITS (<MODE>mode); 3506 rtx c = GEN_INT (0x80000000);
3126 rtvec v = rtvec_alloc (n_elt);
3127
3128 /* Create bitmask for vector select. */
3129 for (i = 0; i < n_elt; ++i)
3130 RTVEC_ELT (v, i) = GEN_INT (0x80000000);
3131 3507
3132 emit_move_insn (v_bitmask, 3508 emit_move_insn (v_bitmask,
3133 gen_rtx_CONST_VECTOR (<VCVTF:V_cmp_result>mode, v)); 3509 gen_const_vec_duplicate (<VCVTF:V_cmp_result>mode, c));
3134 emit_move_insn (operands[0], operands[2]); 3510 emit_move_insn (operands[0], operands[2]);
3135 v_bitmask_cast = simplify_gen_subreg (<MODE>mode, v_bitmask, 3511 v_bitmask_cast = simplify_gen_subreg (<MODE>mode, v_bitmask,
3136 <VCVTF:V_cmp_result>mode, 0); 3512 <VCVTF:V_cmp_result>mode, 0);
3137 emit_insn (gen_neon_vbsl<mode> (operands[0], v_bitmask_cast, operands[0], 3513 emit_insn (gen_neon_vbsl<mode> (operands[0], v_bitmask_cast, operands[0],
3138 operands[1])); 3514 operands[1]));
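Per lane, this bitmask-plus-vbsl sequence implements a copysign: keep the magnitude bits of operand 1 and the sign bit of operand 2. A scalar sketch of the same bit manipulation:

    #include <stdint.h>
    #include <string.h>

    float copysign_lane (float mag, float sgn)
    {
      uint32_t m, s, r;
      memcpy (&m, &mag, 4);
      memcpy (&s, &sgn, 4);
      r = (m & 0x7fffffffu) | (s & 0x80000000u);
      float out;
      memcpy (&out, &r, 4);
      return out;
    }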
6317 emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg)); 6693 emit_insn (gen_neon_vec_pack_trunc_<V_double> (operands[0], tempreg));
6318 DONE; 6694 DONE;
6319 }) 6695 })
6320 6696
6321 (define_insn "neon_vabd<mode>_2" 6697 (define_insn "neon_vabd<mode>_2"
6322 [(set (match_operand:VDQ 0 "s_register_operand" "=w") 6698 [(set (match_operand:VF 0 "s_register_operand" "=w")
6323 (abs:VDQ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w") 6699 (abs:VF (minus:VF (match_operand:VF 1 "s_register_operand" "w")
6324 (match_operand:VDQ 2 "s_register_operand" "w"))))] 6700 (match_operand:VF 2 "s_register_operand" "w"))))]
6325 "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" 6701 "TARGET_NEON && flag_unsafe_math_optimizations"
6326 "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" 6702 "vabd.<V_s_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
6327 [(set (attr "type") 6703 [(set_attr "type" "neon_fp_abd_s<q>")]
6328 (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
6329 (const_string "neon_fp_abd_s<q>")
6330 (const_string "neon_abd<q>")))]
6331 ) 6704 )
6332 6705
6333 (define_insn "neon_vabd<mode>_3" 6706 (define_insn "neon_vabd<mode>_3"
6334 [(set (match_operand:VDQ 0 "s_register_operand" "=w") 6707 [(set (match_operand:VF 0 "s_register_operand" "=w")
6335 (abs:VDQ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w") 6708 (abs:VF (unspec:VF [(match_operand:VF 1 "s_register_operand" "w")
6336 (match_operand:VDQ 2 "s_register_operand" "w")] 6709 (match_operand:VF 2 "s_register_operand" "w")]
6337 UNSPEC_VSUB)))] 6710 UNSPEC_VSUB)))]
6338 "TARGET_NEON && (!<Is_float_mode> || flag_unsafe_math_optimizations)" 6711 "TARGET_NEON && flag_unsafe_math_optimizations"
6339 "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2" 6712 "vabd.<V_if_elem> %<V_reg>0, %<V_reg>1, %<V_reg>2"
6340 [(set (attr "type") 6713 [(set_attr "type" "neon_fp_abd_s<q>")]
6341 (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
6342 (const_string "neon_fp_abd_s<q>")
6343 (const_string "neon_abd<q>")))]
6344 ) 6714 )
6345 6715
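With the patterns now restricted to the float modes (VF) and gated on -funsafe-math-optimizations, a scalar sketch of the source shape they let GCC fold into a single vabd.f32:

    #include <math.h>

    float vabd_ref (float a, float b)
    {
      return fabsf (a - b);   /* abs (minus ...) as in neon_vabd<mode>_2 */
    }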
6346 ;; Copy from core-to-neon regs, then extend, not vice-versa 6716 ;; Copy from core-to-neon regs, then extend, not vice-versa
6347 6717
6348 (define_split 6718 (define_split