diff gcc/config/aarch64/aarch64-simd.md @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
--- a/gcc/config/aarch64/aarch64-simd.md	Fri Oct 27 22:46:09 2017 +0900
+++ b/gcc/config/aarch64/aarch64-simd.md	Thu Oct 25 07:37:49 2018 +0900
@@ -1,5 +1,5 @@
 ;; Machine description for AArch64 AdvSIMD architecture.
-;; Copyright (C) 2011-2017 Free Software Foundation, Inc.
+;; Copyright (C) 2011-2018 Free Software Foundation, Inc.
 ;; Contributed by ARM Ltd.
 ;;
 ;; This file is part of GCC.
@@ -31,9 +31,9 @@
      normal str, so the check need not apply.  */
   if (GET_CODE (operands[0]) == MEM
       && !(aarch64_simd_imm_zero (operands[1], <MODE>mode)
-	   && ((GET_MODE_SIZE (<MODE>mode) == 16
+	   && ((known_eq (GET_MODE_SIZE (<MODE>mode), 16)
 		&& aarch64_mem_pair_operand (operands[0], DImode))
-	       || GET_MODE_SIZE (<MODE>mode) == 8)))
+	       || known_eq (GET_MODE_SIZE (<MODE>mode), 8))))
       operands[1] = force_reg (<MODE>mode, operands[1]);
   "
 )
@@ -80,7 +80,7 @@
           )))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon_dup<q>")]
@@ -95,14 +95,13 @@
           )))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-					  INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "dup\\t%0.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon_dup<q>")]
 )
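As a rough C sketch (not part of this patch), the remapping performed by ENDIAN_LANE_N and the new aarch64_endian_lane_rtx wrapper amounts to the following; endian_lane is a hypothetical stand-in name:

/* RTL lane numbers follow the GCC vector-extension (memory-order)
   convention, so on big-endian targets the index has to be flipped
   before it is printed as an architectural register lane.  */
static inline unsigned int
endian_lane (unsigned int nunits, unsigned int lane, int bytes_big_endian)
{
  return bytes_big_endian ? nunits - 1 - lane : lane;
}
/* e.g. lane 1 of a 4-lane vector prints as lane 2 on big-endian.  */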
 
-(define_insn "*aarch64_simd_mov<mode>"
+(define_insn "*aarch64_simd_mov<VD:mode>"
   [(set (match_operand:VD 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VD 1 "general_operand"
@@ -121,19 +120,18 @@
      case 5: return "fmov\t%d0, %1";
      case 6: return "mov\t%0, %1";
      case 7:
-	return aarch64_output_simd_mov_immediate (operands[1],
-						  <MODE>mode, 64);
+	return aarch64_output_simd_mov_immediate (operands[1], 64);
      default: gcc_unreachable ();
      }
 }
-  [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\
+  [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
 		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
 		     mov_reg, neon_move<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<mode>"
+(define_insn "*aarch64_simd_mov<VQ:mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
@@ -155,13 +153,13 @@
     case 6:
 	return "#";
     case 7:
-	return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128);
+	return aarch64_output_simd_mov_immediate (operands[1], 128);
     default:
 	gcc_unreachable ();
     }
 }
-  [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\
-		     neon_stp, neon_logic<q>, multiple, multiple,\
+  [(set_attr "type" "neon_load1_1reg<q>, store_16, neon_store1_1reg<q>,\
+		     neon_logic<q>, multiple, multiple,\
 		     multiple, neon_move<q>")
    (set_attr "length" "4,4,4,4,8,8,8,4")]
 )
@@ -174,39 +172,67 @@
 	(vec_select:<VEL> (match_operand:VALL_F16 1 "register_operand" "w")
 			(parallel [(match_operand 2 "const_int_operand" "n")])))]
   "TARGET_SIMD
-   && ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])) == 0"
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 0"
   "str\\t%<Vetype>1, %0"
   [(set_attr "type" "neon_store1_1reg<q>")]
 )
 
-(define_insn "load_pair<mode>"
-  [(set (match_operand:VD 0 "register_operand" "=w")
-	(match_operand:VD 1 "aarch64_mem_pair_operand" "Ump"))
-   (set (match_operand:VD 2 "register_operand" "=w")
-	(match_operand:VD 3 "memory_operand" "m"))]
+(define_insn "load_pair<DREG:mode><DREG2:mode>"
+  [(set (match_operand:DREG 0 "register_operand" "=w")
+	(match_operand:DREG 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:DREG2 2 "register_operand" "=w")
+	(match_operand:DREG2 3 "memory_operand" "m"))]
   "TARGET_SIMD
    && rtx_equal_p (XEXP (operands[3], 0),
 		   plus_constant (Pmode,
 				  XEXP (operands[1], 0),
-				  GET_MODE_SIZE (<MODE>mode)))"
+				  GET_MODE_SIZE (<DREG:MODE>mode)))"
   "ldp\\t%d0, %d2, %1"
   [(set_attr "type" "neon_ldp")]
 )
 
-(define_insn "store_pair<mode>"
-  [(set (match_operand:VD 0 "aarch64_mem_pair_operand" "=Ump")
-	(match_operand:VD 1 "register_operand" "w"))
-   (set (match_operand:VD 2 "memory_operand" "=m")
-	(match_operand:VD 3 "register_operand" "w"))]
+(define_insn "vec_store_pair<DREG:mode><DREG2:mode>"
+  [(set (match_operand:DREG 0 "aarch64_mem_pair_operand" "=Ump")
+	(match_operand:DREG 1 "register_operand" "w"))
+   (set (match_operand:DREG2 2 "memory_operand" "=m")
+	(match_operand:DREG2 3 "register_operand" "w"))]
   "TARGET_SIMD
    && rtx_equal_p (XEXP (operands[2], 0),
 		   plus_constant (Pmode,
 				  XEXP (operands[0], 0),
-				  GET_MODE_SIZE (<MODE>mode)))"
+				  GET_MODE_SIZE (<DREG:MODE>mode)))"
   "stp\\t%d1, %d3, %0"
   [(set_attr "type" "neon_stp")]
 )
 
+(define_insn "load_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "register_operand" "=w")
+	(match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:VQ2 2 "register_operand" "=w")
+	(match_operand:VQ2 3 "memory_operand" "m"))]
+  "TARGET_SIMD
+    && rtx_equal_p (XEXP (operands[3], 0),
+		    plus_constant (Pmode,
+			       XEXP (operands[1], 0),
+			       GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "ldp\\t%q0, %q2, %1"
+  [(set_attr "type" "neon_ldp_q")]
+)
+
+(define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
+  [(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump")
+	(match_operand:VQ 1 "register_operand" "w"))
+   (set (match_operand:VQ2 2 "memory_operand" "=m")
+	(match_operand:VQ2 3 "register_operand" "w"))]
+  "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0),
+		plus_constant (Pmode,
+			       XEXP (operands[0], 0),
+			       GET_MODE_SIZE (<VQ:MODE>mode)))"
+  "stp\\t%q1, %q3, %0"
+  [(set_attr "type" "neon_stp_q")]
+)
+
+
 (define_split
   [(set (match_operand:VQ 0 "register_operand" "")
       (match_operand:VQ 1 "register_operand" ""))]
@@ -231,7 +257,7 @@
   DONE;
 })
 
-(define_expand "aarch64_split_simd_mov<mode>"
+(define_expand "@aarch64_split_simd_mov<mode>"
   [(set (match_operand:VQ 0)
         (match_operand:VQ 1))]
   "TARGET_SIMD"
@@ -254,8 +280,8 @@
       {
         rtx dst_low_part = gen_lowpart (<VHALF>mode, dst);
         rtx dst_high_part = gen_highpart (<VHALF>mode, dst);
-        rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
-        rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+	rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
+	rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
 
         emit_insn
           (gen_aarch64_simd_mov_from_<mode>low (dst_low_part, src, lo));
@@ -451,8 +477,7 @@
 		DOTPROD)))]
   "TARGET_DOTPROD"
   {
-    operands[4]
-      = GEN_INT (ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4]));
     return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
   }
   [(set_attr "type" "neon_dot")]
@@ -467,8 +492,7 @@
 		DOTPROD)))]
   "TARGET_DOTPROD"
   {
-    operands[4]
-      = GEN_INT (ENDIAN_LANE_N (V16QImode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4]));
     return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
   }
   [(set_attr "type" "neon_dot")]
@@ -502,7 +526,7 @@
       (match_operand:VMUL 3 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
@@ -518,8 +542,7 @@
       (match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-					  INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
   }
   [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
@@ -536,7 +559,7 @@
   [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )
 
-(define_insn "aarch64_rsqrte<mode>"
+(define_insn "@aarch64_rsqrte<mode>"
   [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
 	(unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")]
 		     UNSPEC_RSQRTE))]
@@ -544,7 +567,7 @@
   "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
   [(set_attr "type" "neon_fp_rsqrte_<stype><q>")])
 
-(define_insn "aarch64_rsqrts<mode>"
+(define_insn "@aarch64_rsqrts<mode>"
   [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
 	(unspec:VHSDF_HSDF [(match_operand:VHSDF_HSDF 1 "register_operand" "w")
 			    (match_operand:VHSDF_HSDF 2 "register_operand" "w")]
@@ -572,7 +595,7 @@
        (match_operand:DF 3 "register_operand" "w")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2]));
     return "fmul\\t%0.2d, %3.2d, %1.d[%2]";
   }
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
@@ -617,6 +640,67 @@
   [(set_attr "type" "neon_abd<q>")]
 )
 
+(define_insn "aarch64_<sur>abdl2<mode>_3"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+	(unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w")
+			  (match_operand:VDQV_S 2 "register_operand" "w")]
+	ABDL2))]
+  "TARGET_SIMD"
+  "<sur>abdl2\t%0.<Vwtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_abd<q>")]
+)
+
+(define_insn "aarch64_<sur>abal<mode>_4"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+	(unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w")
+			  (match_operand:VDQV_S 2 "register_operand" "w")
+			 (match_operand:<VDBLW> 3 "register_operand" "0")]
+	ABAL))]
+  "TARGET_SIMD"
+  "<sur>abal\t%0.<Vwtype>, %1.<Vhalftype>, %2.<Vhalftype>"
+  [(set_attr "type" "neon_arith_acc<q>")]
+)
+
+(define_insn "aarch64_<sur>adalp<mode>_3"
+  [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+	(unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w")
+			  (match_operand:<VDBLW> 2 "register_operand" "0")]
+	ADALP))]
+  "TARGET_SIMD"
+  "<sur>adalp\t%0.<Vwtype>, %1.<Vtype>"
+  [(set_attr "type" "neon_reduc_add<q>")]
+)
+
+;; Emit a sequence to produce a sum-of-absolute-differences of the V16QI
+;; inputs in operands 1 and 2.  The sequence also has to perform a widening
+;; reduction of the difference into a V4SI vector and accumulate that into
+;; operand 3 before copying that into the result operand 0.
+;; Perform that with a sequence of:
+;; UABDL2	tmp.8h, op1.16b, op2.16b
+;; UABAL	tmp.8h, op1.16b, op2.16b
+;; UADALP	op3.4s, tmp.8h
+;; MOV		op0, op3 // should be eliminated in later passes.
+;; The signed version just uses the signed variants of the above instructions.
+
+(define_expand "<sur>sadv16qi"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (unspec:V16QI [(use (match_operand:V16QI 1 "register_operand"))
+		  (use (match_operand:V16QI 2 "register_operand"))] ABAL)
+   (use (match_operand:V4SI 3 "register_operand"))]
+  "TARGET_SIMD"
+  {
+    rtx reduc = gen_reg_rtx (V8HImode);
+    emit_insn (gen_aarch64_<sur>abdl2v16qi_3 (reduc, operands[1],
+					       operands[2]));
+    emit_insn (gen_aarch64_<sur>abalv16qi_4 (reduc, operands[1],
+					      operands[2], reduc));
+    emit_insn (gen_aarch64_<sur>adalpv8hi_3 (operands[3], reduc,
+					      operands[3]));
+    emit_move_insn (operands[0], operands[3]);
+    DONE;
+  }
+)
+
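A C-level rendering of the sequence described in the comment above, for the unsigned case and using ACLE intrinsics from arm_neon.h (a sketch only; usad_v16qi is a hypothetical helper name, not something this patch defines):

#include <arm_neon.h>

static inline uint32x4_t
usad_v16qi (uint8x16_t op1, uint8x16_t op2, uint32x4_t acc)
{
  /* UABDL2: widening absolute difference of the high halves.  */
  uint16x8_t tmp = vabdl_high_u8 (op1, op2);
  /* UABAL: accumulate the widening absolute difference of the low halves.  */
  tmp = vabal_u8 (tmp, vget_low_u8 (op1), vget_low_u8 (op2));
  /* UADALP: pairwise widen and accumulate into the V4SI accumulator.  */
  return vpadalq_u16 (acc, tmp);
}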
 (define_insn "aba<mode>_3"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
 	(plus:VDQ_BHSI (abs:VDQ_BHSI (minus:VDQ_BHSI
@@ -651,8 +735,8 @@
       case 0:
 	return "and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>";
       case 1:
-	return aarch64_output_simd_mov_immediate (operands[2],
-	   <MODE>mode, GET_MODE_BITSIZE (<MODE>mode), AARCH64_CHECK_BIC);
+	return aarch64_output_simd_mov_immediate (operands[2], <bitsize>,
+						  AARCH64_CHECK_BIC);
       default:
 	gcc_unreachable ();
       }
@@ -672,8 +756,8 @@
       case 0:
 	return "orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>";
       case 1:
-	return aarch64_output_simd_mov_immediate (operands[2],
-		<MODE>mode, GET_MODE_BITSIZE (<MODE>mode), AARCH64_CHECK_ORR);
+	return aarch64_output_simd_mov_immediate (operands[2], <bitsize>,
+						  AARCH64_CHECK_ORR);
       default:
 	gcc_unreachable ();
       }
@@ -699,29 +783,29 @@
 )
 
 (define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w,w,w")
-        (vec_merge:VDQ_BHSI
-	    (vec_duplicate:VDQ_BHSI
-		(match_operand:<VEL> 1 "aarch64_simd_general_operand" "r,w,Utv"))
-	    (match_operand:VDQ_BHSI 3 "register_operand" "0,0,0")
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w,w,w")
+	(vec_merge:VALL_F16
+	    (vec_duplicate:VALL_F16
+		(match_operand:<VEL> 1 "aarch64_simd_general_operand" "w,?r,Utv"))
+	    (match_operand:VALL_F16 3 "register_operand" "0,0,0")
 	    (match_operand:SI 2 "immediate_operand" "i,i,i")))]
   "TARGET_SIMD"
   {
-   int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
+   int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
    operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
    switch (which_alternative)
      {
      case 0:
-	return "ins\\t%0.<Vetype>[%p2], %w1";
+	return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
      case 1:
-	return "ins\\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
+	return "ins\\t%0.<Vetype>[%p2], %<vwcore>1";
      case 2:
         return "ld1\\t{%0.<Vetype>}[%p2], %1";
      default:
 	gcc_unreachable ();
      }
   }
-  [(set_attr "type" "neon_from_gp<q>, neon_ins<q>, neon_load1_one_lane<q>")]
+  [(set_attr "type" "neon_ins<q>, neon_from_gp<q>, neon_load1_one_lane<q>")]
 )
 
 (define_insn "*aarch64_simd_vec_copy_lane<mode>"
@@ -736,9 +820,9 @@
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
-    int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
+    int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
     operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt);
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[4]));
 
     return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
   }
@@ -757,10 +841,10 @@
 	    (match_operand:SI 2 "immediate_operand" "i")))]
   "TARGET_SIMD"
   {
-    int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
+    int elt = ENDIAN_LANE_N (<nunits>, exact_log2 (INTVAL (operands[2])));
     operands[2] = GEN_INT (HOST_WIDE_INT_1 << elt);
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-			   INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode,
+					   INTVAL (operands[4]));
 
     return "ins\t%0.<Vetype>[%p2], %3.<Vetype>[%4]";
   }
@@ -1035,19 +1119,6 @@
   }
 )
 
-(define_expand "vec_set<mode>"
-  [(match_operand:VDQ_BHSI 0 "register_operand")
-   (match_operand:<VEL> 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_set<mode> (operands[0], operands[1],
-					    GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
 ;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero.
 (define_insn "vec_shr_<mode>"
   [(set (match_operand:VD 0 "register_operand" "=w")
@@ -1064,62 +1135,8 @@
   [(set_attr "type" "neon_shift_imm")]
 )
 
-(define_insn "aarch64_simd_vec_setv2di"
-  [(set (match_operand:V2DI 0 "register_operand" "=w,w")
-        (vec_merge:V2DI
-	    (vec_duplicate:V2DI
-		(match_operand:DI 1 "register_operand" "r,w"))
-	    (match_operand:V2DI 3 "register_operand" "0,0")
-	    (match_operand:SI 2 "immediate_operand" "i,i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (V2DImode, exact_log2 (INTVAL (operands[2])));
-    operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
-    switch (which_alternative)
-      {
-      case 0:
-	return "ins\\t%0.d[%p2], %1";
-      case 1:
-        return "ins\\t%0.d[%p2], %1.d[0]";
-      default:
-	gcc_unreachable ();
-      }
-  }
-  [(set_attr "type" "neon_from_gp, neon_ins_q")]
-)
-
-(define_expand "vec_setv2di"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:DI 1 "register_operand")
-   (match_operand:SI 2 "immediate_operand")]
-  "TARGET_SIMD"
-  {
-    HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]);
-    emit_insn (gen_aarch64_simd_vec_setv2di (operands[0], operands[1],
-					  GEN_INT (elem), operands[0]));
-    DONE;
-  }
-)
-
-(define_insn "aarch64_simd_vec_set<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
-	(vec_merge:VDQF_F16
-	    (vec_duplicate:VDQF_F16
-		(match_operand:<VEL> 1 "register_operand" "w"))
-	    (match_operand:VDQF_F16 3 "register_operand" "0")
-	    (match_operand:SI 2 "immediate_operand" "i")))]
-  "TARGET_SIMD"
-  {
-    int elt = ENDIAN_LANE_N (<MODE>mode, exact_log2 (INTVAL (operands[2])));
-
-    operands[2] = GEN_INT ((HOST_WIDE_INT)1 << elt);
-    return "ins\t%0.<Vetype>[%p2], %1.<Vetype>[0]";
-  }
-  [(set_attr "type" "neon_ins<q>")]
-)
-
 (define_expand "vec_set<mode>"
-  [(match_operand:VDQF_F16 0 "register_operand" "+w")
+  [(match_operand:VALL_F16 0 "register_operand" "+w")
    (match_operand:<VEL> 1 "register_operand" "w")
    (match_operand:SI 2 "immediate_operand" "")]
   "TARGET_SIMD"
@@ -1155,7 +1172,7 @@
 	 (match_operand:VDQHS 4 "register_operand" "0")))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
@@ -1173,8 +1190,7 @@
 	 (match_operand:VDQHS 4 "register_operand" "0")))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-					  INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
@@ -1214,7 +1230,7 @@
 	   (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
@@ -1232,8 +1248,7 @@
 	   (match_operand:VDQHS 3 "register_operand" "w"))))]
  "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-					  INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")]
@@ -1337,9 +1352,8 @@
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_insn "move_lo_quad_internal_<mode>"
@@ -1353,9 +1367,8 @@
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
@@ -1369,9 +1382,8 @@
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
@@ -1385,9 +1397,8 @@
    fmov\\t%d0, %1
    dup\\t%d0, %1"
   [(set_attr "type" "neon_dup<q>,f_mcr,neon_dup<q>")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")
-   (set_attr "length" "4")]
+   (set_attr "length" "4")
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_expand "move_lo_quad_<mode>"
@@ -1441,7 +1452,7 @@
   (match_operand:<VHALF> 1 "register_operand" "")]
  "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
   if (BYTES_BIG_ENDIAN)
     emit_insn (gen_aarch64_simd_move_hi_quad_be_<mode> (operands[0],
 		    operands[1], p));
@@ -1505,7 +1516,7 @@
 			       (match_operand:VQW 2 "vect_par_cnst_lo_half" "")
 			    )))]
   "TARGET_SIMD"
-  "<su>shll\t%0.<Vwtype>, %1.<Vhalftype>, 0"
+  "<su>xtl\t%0.<Vwtype>, %1.<Vhalftype>"
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
@@ -1516,7 +1527,7 @@
 			       (match_operand:VQW 2 "vect_par_cnst_hi_half" "")
 			    )))]
   "TARGET_SIMD"
-  "<su>shll2\t%0.<Vwtype>, %1.<Vtype>, 0"
+  "<su>xtl2\t%0.<Vwtype>, %1.<Vtype>"
   [(set_attr "type" "neon_shift_imm_long")]
 )
 
@@ -1525,7 +1536,7 @@
    (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
     emit_insn (gen_aarch64_simd_vec_unpack<su>_hi_<mode> (operands[0],
 							  operands[1], p));
     DONE;
@@ -1537,7 +1548,7 @@
    (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand" ""))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     emit_insn (gen_aarch64_simd_vec_unpack<su>_lo_<mode> (operands[0],
 							  operands[1], p));
     DONE;
@@ -1657,7 +1668,7 @@
    (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand" ""))]
  "TARGET_SIMD"
  {
-   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
    emit_insn (gen_aarch64_simd_vec_<su>mult_lo_<mode> (operands[0],
 						       operands[1],
 						       operands[2], p));
@@ -1684,7 +1695,7 @@
    (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand" ""))]
  "TARGET_SIMD"
  {
-   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+   rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
    emit_insn (gen_aarch64_simd_vec_<su>mult_hi_<mode> (operands[0],
 						       operands[1],
 						       operands[2], p));
@@ -1803,7 +1814,7 @@
       (match_operand:VDQF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
@@ -1820,8 +1831,7 @@
       (match_operand:VDQSF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-					  INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
@@ -1849,7 +1859,7 @@
       (match_operand:DF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2]));
     return "fmla\\t%0.2d, %3.2d, %1.2d[%2]";
   }
   [(set_attr "type" "neon_fp_mla_d_scalar_q")]
@@ -1858,9 +1868,8 @@
 (define_insn "fnma<mode>4"
   [(set (match_operand:VHSDF 0 "register_operand" "=w")
 	(fma:VHSDF
-	  (match_operand:VHSDF 1 "register_operand" "w")
-          (neg:VHSDF
-	    (match_operand:VHSDF 2 "register_operand" "w"))
+	  (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w"))
+	  (match_operand:VHSDF 2 "register_operand" "w")
 	  (match_operand:VHSDF 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
@@ -1879,7 +1888,7 @@
       (match_operand:VDQF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
@@ -1897,8 +1906,7 @@
       (match_operand:VDQSF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-					  INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[2]));
     return "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]";
   }
   [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")]
@@ -1928,7 +1936,7 @@
       (match_operand:DF 4 "register_operand" "0")))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (V2DFmode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (V2DFmode, INTVAL (operands[2]));
     return "fmls\\t%0.2d, %3.2d, %1.2d[%2]";
   }
   [(set_attr "type" "neon_fp_mla_d_scalar_q")]
@@ -2090,7 +2098,7 @@
    (match_operand:VQ_HSF 1 "register_operand" "")]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
 						       operands[1], p));
     DONE;
@@ -2113,7 +2121,7 @@
    (match_operand:VQ_HSF 1 "register_operand" "")]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
     emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
 						       operands[1], p));
     DONE;
@@ -2221,8 +2229,9 @@
 ;; Max/Min are introduced by idiom recognition by GCC's mid-end.  An
 ;; expression like:
 ;;      a = (b < c) ? b : c;
-;; is idiom-matched as MIN_EXPR<b,c> only if -ffinite-math-only is enabled
-;; either explicitly or indirectly via -ffast-math.
+;; is idiom-matched as MIN_EXPR<b,c> only if -ffinite-math-only and
+;; -fno-signed-zeros are enabled either explicitly or indirectly via
+;; -ffast-math.
 ;;
 ;; MIN_EXPR and MAX_EXPR eventually map to 'smin' and 'smax' in RTL.
 ;; The 'smax' and 'smin' RTL standard pattern names do not specify which
@@ -2261,7 +2270,7 @@
 	       UNSPEC_ADDV)]
   "TARGET_SIMD"
   {
-    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
     rtx scratch = gen_reg_rtx (<MODE>mode);
     emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
     emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
@@ -2312,7 +2321,7 @@
 		    UNSPEC_FADDV))]
  "TARGET_SIMD"
 {
-  rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
+  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
   rtx scratch = gen_reg_rtx (V4SFmode);
   emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
   emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
@@ -2347,14 +2356,14 @@
 ;; 'across lanes' max and min ops.
 
 ;; Template for outputting a scalar, so we can create __builtins which can be
-;; gimple_fold'd to the REDUC_(MAX|MIN)_EXPR tree code.  (This is FP smax/smin).
+;; gimple_fold'd to the IFN_REDUC_(MAX|MIN) function.  (This is FP smax/smin).
 (define_expand "reduc_<maxmin_uns>_scal_<mode>"
   [(match_operand:<VEL> 0 "register_operand")
    (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")]
 		  FMAXMINV)]
   "TARGET_SIMD"
   {
-    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
     rtx scratch = gen_reg_rtx (<MODE>mode);
     emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
 							      operands[1]));
@@ -2370,7 +2379,7 @@
 		    MAXMINV)]
   "TARGET_SIMD"
   {
-    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
     rtx scratch = gen_reg_rtx (<MODE>mode);
     emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
 							      operands[1]));
@@ -2427,13 +2436,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_INT_EQUIV> 3)
 	))]
   "TARGET_SIMD"
@@ -2451,14 +2460,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:<V_INT_EQUIV> 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2467,6 +2476,100 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
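For reference, a minimal C sketch (not part of the patch) of the scalar computation the splitter above re-emits on the general-register side; bsl_di is a hypothetical name:

/* Bit-select of op2/op3 under mask op1: ((op2 ^ op3) & op1) ^ op3.  */
static inline unsigned long long
bsl_di (unsigned long long op1, unsigned long long op2, unsigned long long op3)
{
  unsigned long long scratch = (op2 ^ op3) & op1;   /* xordi3 + anddi3 */
  return scratch ^ op3;                             /* final xordi3 */
}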
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,w,0,r")
+	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 2)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_INT_EQUIV> 1 "register_operand")
@@ -2646,10 +2749,10 @@
 	  break;
 	}
       /* Fall through.  */
-    case UNGE:
+    case UNLT:
       std::swap (operands[2], operands[3]);
       /* Fall through.  */
-    case UNLE:
+    case UNGT:
     case GT:
       comparison = gen_aarch64_cmgt<mode>;
       break;
@@ -2660,10 +2763,10 @@
 	  break;
 	}
       /* Fall through.  */
-    case UNGT:
+    case UNLE:
       std::swap (operands[2], operands[3]);
       /* Fall through.  */
-    case UNLT:
+    case UNGE:
     case GE:
       comparison = gen_aarch64_cmge<mode>;
       break;
@@ -2674,6 +2777,7 @@
     case UNEQ:
     case ORDERED:
     case UNORDERED:
+    case LTGT:
       break;
     default:
       gcc_unreachable ();
@@ -2685,21 +2789,41 @@
     case UNGT:
     case UNLE:
     case UNLT:
-    case NE:
-      /* FCM returns false for lanes which are unordered, so if we use
-	 the inverse of the comparison we actually want to emit, then
-	 invert the result, we will end up with the correct result.
-	 Note that a NE NaN and NaN NE b are true for all a, b.
-
-	 Our transformations are:
-	 a UNGE b -> !(b GT a)
-	 a UNGT b -> !(b GE a)
-	 a UNLE b -> !(a GT b)
-	 a UNLT b -> !(a GE b)
-	 a   NE b -> !(a EQ b)  */
-      gcc_assert (comparison != NULL);
-      emit_insn (comparison (operands[0], operands[2], operands[3]));
-      emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0]));
+      {
+	/* All of the above must not raise any FP exceptions.  Thus we first
+	   check each operand for NaNs and force any elements containing NaN to
+	   zero before using them in the compare.
+	   Example: UN<cc> (a, b) -> UNORDERED (a, b) |
+				     (cm<cc> (isnan (a) ? 0.0 : a,
+					      isnan (b) ? 0.0 : b))
+	   We use the following transformations for doing the comparisons:
+	   a UNGE b -> a GE b
+	   a UNGT b -> a GT b
+	   a UNLE b -> b GE a
+	   a UNLT b -> b GT a.  */
+
+	rtx tmp0 = gen_reg_rtx (<V_INT_EQUIV>mode);
+	rtx tmp1 = gen_reg_rtx (<V_INT_EQUIV>mode);
+	rtx tmp2 = gen_reg_rtx (<V_INT_EQUIV>mode);
+	emit_insn (gen_aarch64_cmeq<mode> (tmp0, operands[2], operands[2]));
+	emit_insn (gen_aarch64_cmeq<mode> (tmp1, operands[3], operands[3]));
+	emit_insn (gen_and<v_int_equiv>3 (tmp2, tmp0, tmp1));
+	emit_insn (gen_and<v_int_equiv>3 (tmp0, tmp0,
+					  lowpart_subreg (<V_INT_EQUIV>mode,
+							  operands[2],
+							  <MODE>mode)));
+	emit_insn (gen_and<v_int_equiv>3 (tmp1, tmp1,
+					  lowpart_subreg (<V_INT_EQUIV>mode,
+							  operands[3],
+							  <MODE>mode)));
+	gcc_assert (comparison != NULL);
+	emit_insn (comparison (operands[0],
+			       lowpart_subreg (<MODE>mode,
+					       tmp0, <V_INT_EQUIV>mode),
+			       lowpart_subreg (<MODE>mode,
+					       tmp1, <V_INT_EQUIV>mode)));
+	emit_insn (gen_orn<v_int_equiv>3 (operands[0], tmp2, operands[0]));
+      }
       break;
 
     case LT:
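A scalar C sketch (not part of the patch) of the quiet UNGE case handled above; unge_quiet is a hypothetical name, and the same NaN-masking idea covers UNGT/UNLE/UNLT with the operand swaps listed in the comment:

static inline int
unge_quiet (float a, float b)
{
  int a_ord = (a == a);           /* cmeq (a, a): 0 iff a is a NaN (quiet).  */
  int b_ord = (b == b);           /* cmeq (b, b) */
  float a0 = a_ord ? a : 0.0f;    /* force NaN inputs to zero ...  */
  float b0 = b_ord ? b : 0.0f;    /* ... so the GE below cannot trap.  */
  /* UNORDERED (a, b) | (a0 GE b0), combined with ORN in the vector code.  */
  return !(a_ord && b_ord) || (a0 >= b0);
}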
@@ -2707,42 +2831,46 @@
     case GT:
     case GE:
     case EQ:
+    case NE:
       /* The easy case.  Here we emit one of FCMGE, FCMGT or FCMEQ.
 	 As a LT b <=> b GE a && a LE b <=> b GT a.  Our transformations are:
 	 a GE b -> a GE b
 	 a GT b -> a GT b
 	 a LE b -> b GE a
 	 a LT b -> b GT a
-	 a EQ b -> a EQ b  */
+	 a EQ b -> a EQ b
+	 a NE b -> ~(a EQ b)  */
       gcc_assert (comparison != NULL);
       emit_insn (comparison (operands[0], operands[2], operands[3]));
+      if (code == NE)
+	emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0]));
       break;
 
-    case UNEQ:
-      /* We first check (a > b ||  b > a) which is !UNEQ, inverting
-	 this result will then give us (a == b || a UNORDERED b).  */
+    case LTGT:
+      /* LTGT is not guranteed to not generate a FP exception.  So let's
+	 go the faster way : ((a > b) || (b > a)).  */
       emit_insn (gen_aarch64_cmgt<mode> (operands[0],
 					 operands[2], operands[3]));
       emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[3], operands[2]));
       emit_insn (gen_ior<v_int_equiv>3 (operands[0], operands[0], tmp));
-      emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0]));
-      break;
-
-    case UNORDERED:
-      /* Operands are ORDERED iff (a > b || b >= a), so we can compute
-	 UNORDERED as !ORDERED.  */
-      emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[2], operands[3]));
-      emit_insn (gen_aarch64_cmge<mode> (operands[0],
-					 operands[3], operands[2]));
-      emit_insn (gen_ior<v_int_equiv>3 (operands[0], operands[0], tmp));
-      emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0]));
       break;
 
     case ORDERED:
-      emit_insn (gen_aarch64_cmgt<mode> (tmp, operands[2], operands[3]));
-      emit_insn (gen_aarch64_cmge<mode> (operands[0],
-					 operands[3], operands[2]));
-      emit_insn (gen_ior<v_int_equiv>3 (operands[0], operands[0], tmp));
+    case UNORDERED:
+    case UNEQ:
+      /* cmeq (a, a) & cmeq (b, b).  */
+      emit_insn (gen_aarch64_cmeq<mode> (operands[0],
+					 operands[2], operands[2]));
+      emit_insn (gen_aarch64_cmeq<mode> (tmp, operands[3], operands[3]));
+      emit_insn (gen_and<v_int_equiv>3 (operands[0], operands[0], tmp));
+
+      if (code == UNORDERED)
+	emit_insn (gen_one_cmpl<v_int_equiv>2 (operands[0], operands[0]));
+      else if (code == UNEQ)
+	{
+	  emit_insn (gen_aarch64_cmeq<mode> (tmp, operands[2], operands[3]));
+	  emit_insn (gen_orn<v_int_equiv>3 (operands[0], operands[0], tmp));
+	}
       break;
 
     default:
@@ -2895,37 +3023,38 @@
 	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "smov\\t%<GPI:w>0, %1.<VDQQH:Vetype>[%2]";
   }
-  [(set_attr "type" "neon_to_gp<q>")]
-)
-
-(define_insn "*aarch64_get_lane_zero_extendsi<mode>"
-  [(set (match_operand:SI 0 "register_operand" "=r")
-	(zero_extend:SI
-	  (vec_select:<VEL>
-	    (match_operand:VDQQH 1 "register_operand" "w")
-	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
-  "TARGET_SIMD"
-  {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
-    return "umov\\t%w0, %1.<Vetype>[%2]";
-  }
-  [(set_attr "type" "neon_to_gp<q>")]
+  [(set_attr "type" "neon_to_gp<q>")]
+)
+
+(define_insn "*aarch64_get_lane_zero_extend<GPI:mode><VDQQH:mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(zero_extend:GPI
+	  (vec_select:<VEL>
+	    (match_operand:VDQQH 1 "register_operand" "w")
+	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
+  "TARGET_SIMD"
+  {
+    operands[2] = aarch64_endian_lane_rtx (<VDQQH:MODE>mode,
+					   INTVAL (operands[2]));
+    return "umov\\t%w0, %1.<Vetype>[%2]";
+  }
+  [(set_attr "type" "neon_to_gp<q>")]
 )
 
 ;; Lane extraction of a value, neither sign nor zero extension
 ;; is guaranteed so upper bits should be considered undefined.
 ;; RTL uses GCC vector extension indices throughout so flip only for assembly.
 (define_insn "aarch64_get_lane<mode>"
-  [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv")
+  [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=?r, w, Utv")
 	(vec_select:<VEL>
 	  (match_operand:VALL_F16 1 "register_operand" "w, w, w")
 	  (parallel [(match_operand:SI 2 "immediate_operand" "i, i, i")])))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     switch (which_alternative)
       {
 	case 0:
@@ -2941,37 +3070,61 @@
   [(set_attr "type" "neon_to_gp<q>, neon_dup<q>, neon_store1_one_lane<q>")]
 )
 
+(define_insn "load_pair_lanes<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w")
+	(vec_concat:<VDBL>
+	   (match_operand:VDC 1 "memory_operand" "Utq")
+	   (match_operand:VDC 2 "memory_operand" "m")))]
+  "TARGET_SIMD && !STRICT_ALIGNMENT
+   && rtx_equal_p (XEXP (operands[2], 0),
+		   plus_constant (Pmode,
+				  XEXP (operands[1], 0),
+				  GET_MODE_SIZE (<MODE>mode)))"
+  "ldr\\t%q0, %1"
+  [(set_attr "type" "neon_load1_1reg_q")]
+)
+
+(define_insn "store_pair_lanes<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn")
+	(vec_concat:<VDBL>
+	   (match_operand:VDC 1 "register_operand" "w, r")
+	   (match_operand:VDC 2 "register_operand" "w, r")))]
+  "TARGET_SIMD"
+  "@
+   stp\\t%d1, %d2, %y0
+   stp\\t%x1, %x2, %y0"
+  [(set_attr "type" "neon_stp, store_16")]
+)
+
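A minimal C example (not part of the patch) of the kind of source the new load_pair_lanes/store_pair_lanes patterns target, assuming ACLE intrinsics from arm_neon.h; load_adjacent is a hypothetical helper:

#include <arm_neon.h>

static inline float64x2_t
load_adjacent (const double *p)
{
  /* vec_concat of mem[p] and mem[p + 1]: when the second address is exactly
     GET_MODE_SIZE (DFmode) past the first (and alignment is not strict),
     this can be matched as a single "ldr q0, [p]"; the mirror-image store
     becomes a single STP via store_pair_lanes.  */
  return vcombine_f64 (vld1_f64 (p), vld1_f64 (p + 1));
}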
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
 (define_insn "*aarch64_combinez<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
-        (vec_concat:<VDBL>
-	   (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")
-	   (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))]
+	(vec_concat:<VDBL>
+	  (match_operand:VDC 1 "general_operand" "w,?r,m")
+	  (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
   "@
    mov\\t%0.8b, %1.8b
    fmov\t%d0, %1
    ldr\\t%d0, %1"
   [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")]
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_insn "*aarch64_combinez_be<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
         (vec_concat:<VDBL>
-	   (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")
-	   (match_operand:VD_BHSI 1 "general_operand" "w,?r,m")))]
+	  (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")
+	  (match_operand:VDC 1 "general_operand" "w,?r,m")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "@
    mov\\t%0.8b, %1.8b
    fmov\t%d0, %1
    ldr\\t%d0, %1"
   [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
-   (set_attr "simd" "yes,*,yes")
-   (set_attr "fp" "*,yes,*")]
+   (set_attr "arch" "simd,fp,simd")]
 )
 
 (define_expand "aarch64_combine<mode>"
@@ -2986,7 +3139,7 @@
 }
 )
 
-(define_expand "aarch64_simd_combine<mode>"
+(define_expand "@aarch64_simd_combine<mode>"
   [(match_operand:<VDBL> 0 "register_operand")
    (match_operand:VDC 1 "register_operand")
    (match_operand:VDC 2 "register_operand")]
@@ -3034,7 +3187,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_saddl<mode>_hi_internal (operands[0], operands[1],
                                                   operands[2], p));
   DONE;
@@ -3046,7 +3199,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_uaddl<mode>_hi_internal (operands[0], operands[1],
                                                   operands[2], p));
   DONE;
@@ -3058,7 +3211,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_ssubl<mode>_hi_internal (operands[0], operands[1],
 						operands[2], p));
   DONE;
@@ -3070,7 +3223,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_usubl<mode>_hi_internal (operands[0], operands[1],
 						operands[2], p));
   DONE;
@@ -3096,7 +3249,7 @@
 		      (match_operand:<VDBLW> 2 "register_operand" "")))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     rtx temp = gen_reg_rtx (GET_MODE (operands[0]));
 
     emit_insn (gen_aarch64_saddw<mode>_internal (temp, operands[2],
@@ -3124,7 +3277,7 @@
 		      (match_operand:<VDBLW> 2 "register_operand" "")))]
   "TARGET_SIMD"
   {
-    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, false);
+    rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
     rtx temp = gen_reg_rtx (GET_MODE (operands[0]));
 
     emit_insn (gen_aarch64_uaddw<mode>_internal (temp, operands[2],
@@ -3145,38 +3298,74 @@
   DONE;
 })
 
-(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>"
+(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+	(minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
+	  (ANY_EXTEND:<VWIDE>
+	    (match_operand:VD_BHSI 2 "register_operand" "w"))))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>subw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_sub_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>subw<mode>_internal"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
-        (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
-			(ANY_EXTEND:<VWIDE>
-			  (match_operand:VD_BHSI 2 "register_operand" "w"))))]
-  "TARGET_SIMD"
-  "<ANY_EXTEND:su><ADDSUB:optab>w\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
-)
-
-(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>_internal"
+	(minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
+	  (ANY_EXTEND:<VWIDE>
+	    (vec_select:<VHALF>
+	      (match_operand:VQW 2 "register_operand" "w")
+	      (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>subw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
+  [(set_attr "type" "neon_sub_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>subw2<mode>_internal"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
-        (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
-			(ANY_EXTEND:<VWIDE>
-			  (vec_select:<VHALF>
-			   (match_operand:VQW 2 "register_operand" "w")
-			   (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))))]
-  "TARGET_SIMD"
-  "<ANY_EXTEND:su><ADDSUB:optab>w\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
-  [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
-)
-
-(define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>w2<mode>_internal"
+	(minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
+	  (ANY_EXTEND:<VWIDE>
+	    (vec_select:<VHALF>
+	      (match_operand:VQW 2 "register_operand" "w")
+	      (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>subw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_sub_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+	(plus:<VWIDE>
+	  (ANY_EXTEND:<VWIDE> (match_operand:VD_BHSI 2 "register_operand" "w"))
+	  (match_operand:<VWIDE> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>addw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>_internal"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
-        (ADDSUB:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
-			(ANY_EXTEND:<VWIDE>
-			  (vec_select:<VHALF>
-			   (match_operand:VQW 2 "register_operand" "w")
-			   (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))))]
-  "TARGET_SIMD"
-  "<ANY_EXTEND:su><ADDSUB:optab>w2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<ADDSUB:optab>_widen")]
+	(plus:<VWIDE>
+	  (ANY_EXTEND:<VWIDE>
+	    (vec_select:<VHALF>
+	      (match_operand:VQW 2 "register_operand" "w")
+	      (match_operand:VQW 3 "vect_par_cnst_lo_half" "")))
+	  (match_operand:<VWIDE> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>addw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
+  [(set_attr "type" "neon_add_widen")]
+)
+
+(define_insn "aarch64_<ANY_EXTEND:su>addw2<mode>_internal"
+  [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
+	(plus:<VWIDE>
+	  (ANY_EXTEND:<VWIDE>
+	    (vec_select:<VHALF>
+	      (match_operand:VQW 2 "register_operand" "w")
+	      (match_operand:VQW 3 "vect_par_cnst_hi_half" "")))
+	  (match_operand:<VWIDE> 1 "register_operand" "w")))]
+  "TARGET_SIMD"
+  "<ANY_EXTEND:su>addw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_add_widen")]
 )
 
 (define_expand "aarch64_saddw2<mode>"
@@ -3185,7 +3374,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_saddw2<mode>_internal (operands[0], operands[1],
 						operands[2], p));
   DONE;
@@ -3197,7 +3386,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_uaddw2<mode>_internal (operands[0], operands[1],
 						operands[2], p));
   DONE;
@@ -3210,7 +3399,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_ssubw2<mode>_internal (operands[0], operands[1],
 						operands[2], p));
   DONE;
@@ -3222,7 +3411,7 @@
    (match_operand:VQW 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_usubw2<mode>_internal (operands[0], operands[1],
 						operands[2], p));
   DONE;
@@ -3230,6 +3419,22 @@
 
 ;; <su><r>h<addsub>.
 
+(define_expand "<u>avg<mode>3_floor"
+  [(set (match_operand:VDQ_BHSI 0 "register_operand")
+	(unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")
+			  (match_operand:VDQ_BHSI 2 "register_operand")]
+			 HADD))]
+  "TARGET_SIMD"
+)
+
+(define_expand "<u>avg<mode>3_ceil"
+  [(set (match_operand:VDQ_BHSI 0 "register_operand")
+	(unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")
+			  (match_operand:VDQ_BHSI 2 "register_operand")]
+			 RHADD))]
+  "TARGET_SIMD"
+)
+
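The new <u>avg<mode>3_floor/_ceil expanders map the mid-end's average operations onto the halving-add instructions defined below.  As a C sketch of the element-wise semantics (not part of the patch), for unsigned 8-bit lanes:

/* UHADD: flooring average, (a + b) >> 1, without intermediate overflow.  */
static inline unsigned char
uavg_floor (unsigned char a, unsigned char b)
{
  return (a & b) + ((a ^ b) >> 1);
}

/* URHADD: rounding average, (a + b + 1) >> 1, without intermediate overflow.  */
static inline unsigned char
uavg_ceil (unsigned char a, unsigned char b)
{
  return (a | b) - ((a ^ b) >> 1);
}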
 (define_insn "aarch64_<sur>h<addsub><mode>"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
         (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand" "w")
@@ -3301,8 +3506,7 @@
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VSWAP_WIDTH>mode,
-					  INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VSWAP_WIDTH>mode, INTVAL (operands[3]));
     return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_fp_mul_<Vetype>_scalar<q>")]
@@ -3321,7 +3525,7 @@
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "fmulx\t%<v>0<Vmtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
@@ -3355,7 +3559,7 @@
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "fmulx\t%<Vetype>0, %<Vetype>1, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "fmul<Vetype>")]
@@ -3441,7 +3645,7 @@
 	 VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
@@ -3456,7 +3660,7 @@
 	 VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
@@ -3471,7 +3675,7 @@
 	 VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%<v>0, %<v>1, %2.<v>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
@@ -3486,7 +3690,7 @@
 	 VQDMULH))]
   "TARGET_SIMD"
   "*
-   operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+   operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
    return \"sq<r>dmulh\\t%<v>0, %<v>1, %2.<v>[%3]\";"
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar<q>")]
 )
@@ -3518,7 +3722,7 @@
 	  SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[%4]";
    }
@@ -3536,7 +3740,7 @@
 	  SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%<v>0, %<v>2, %3.<Vetype>[%4]";
    }
@@ -3556,7 +3760,7 @@
 	  SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%0.<Vtype>, %2.<Vtype>, %3.<Vetype>[%4]";
    }
@@ -3574,7 +3778,7 @@
 	  SQRDMLH_AS))]
    "TARGET_SIMD_RDMA"
    {
-     operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+     operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
      return
       "sqrdml<SQRDMLH_AS:rdma_as>h\\t%<v>0, %<v>2, %3.<v>[%4]";
    }
@@ -3618,7 +3822,7 @@
 	    (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
@@ -3642,7 +3846,7 @@
 	    (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
@@ -3665,7 +3869,7 @@
 	    (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
@@ -3688,7 +3892,7 @@
 	    (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
     return
       "sqdml<SBINQOPS:as>l\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
@@ -3743,7 +3947,7 @@
    (match_operand:VQ_HSI 3 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2<mode>_internal (operands[0], operands[1],
 						  operands[2], operands[3], p));
   DONE;
@@ -3756,7 +3960,7 @@
    (match_operand:VQ_HSI 3 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2<mode>_internal (operands[0], operands[1],
 						  operands[2], operands[3], p));
   DONE;
@@ -3783,7 +3987,7 @@
 	      (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[4]));
     return
      "sqdml<SBINQOPS:as>l2\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
@@ -3809,7 +4013,7 @@
 	      (const_int 1))))]
   "TARGET_SIMD"
   {
-    operands[4] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[4])));
+    operands[4] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[4]));
     return
      "sqdml<SBINQOPS:as>l2\\t%<vw2>0<Vmwtype>, %<v>2<Vmtype>, %3.<Vetype>[%4]";
   }
@@ -3824,7 +4028,7 @@
    (match_operand:SI 4 "immediate_operand" "i")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2_lane<mode>_internal (operands[0], operands[1],
 						       operands[2], operands[3],
 						       operands[4], p));
@@ -3839,7 +4043,7 @@
    (match_operand:SI 4 "immediate_operand" "i")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2_laneq<mode>_internal (operands[0], operands[1],
 						       operands[2], operands[3],
 						       operands[4], p));
@@ -3854,7 +4058,7 @@
    (match_operand:SI 4 "immediate_operand" "i")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2_lane<mode>_internal (operands[0], operands[1],
 						       operands[2], operands[3],
 						       operands[4], p));
@@ -3869,7 +4073,7 @@
    (match_operand:SI 4 "immediate_operand" "i")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2_laneq<mode>_internal (operands[0], operands[1],
 						       operands[2], operands[3],
 						       operands[4], p));
@@ -3902,7 +4106,7 @@
    (match_operand:<VEL> 3 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlal2_n<mode>_internal (operands[0], operands[1],
 						    operands[2], operands[3],
 						    p));
@@ -3916,7 +4120,7 @@
    (match_operand:<VEL> 3 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmlsl2_n<mode>_internal (operands[0], operands[1],
 						    operands[2], operands[3],
 						    p));
@@ -3956,7 +4160,7 @@
 	     (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
@@ -3977,7 +4181,7 @@
 	     (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
@@ -3997,7 +4201,7 @@
 	     (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
@@ -4017,7 +4221,7 @@
 	     (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
     return "sqdmull\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
@@ -4070,7 +4274,7 @@
    (match_operand:VQ_HSI 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2<mode>_internal (operands[0], operands[1],
 						  operands[2], p));
   DONE;
@@ -4095,7 +4299,7 @@
 	     (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCOND>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCOND>mode, INTVAL (operands[3]));
     return "sqdmull2\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
@@ -4118,7 +4322,7 @@
 	     (const_int 1)))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<VCONQ>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<VCONQ>mode, INTVAL (operands[3]));
     return "sqdmull2\\t%<vw2>0<Vmwtype>, %<v>1<Vmtype>, %2.<Vetype>[%3]";
   }
   [(set_attr "type" "neon_sat_mul_<Vetype>_scalar_long")]
@@ -4131,7 +4335,7 @@
    (match_operand:SI 3 "immediate_operand" "i")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2_lane<mode>_internal (operands[0], operands[1],
 						       operands[2], operands[3],
 						       p));
@@ -4145,7 +4349,7 @@
    (match_operand:SI 3 "immediate_operand" "i")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2_laneq<mode>_internal (operands[0], operands[1],
 						       operands[2], operands[3],
 						       p));
@@ -4178,7 +4382,7 @@
    (match_operand:<VEL> 2 "register_operand" "w")]
   "TARGET_SIMD"
 {
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, true);
+  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
   emit_insn (gen_aarch64_sqdmull2_n<mode>_internal (operands[0], operands[1],
 						    operands[2], p));
   DONE;
@@ -4343,7 +4547,7 @@
      (clobber (reg:CC CC_REGNUM))]
   "TARGET_SIMD"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(set (match_operand:DI 0 "register_operand")
 	(neg:DI
 	  (COMPARISONS:DI
@@ -4406,7 +4610,7 @@
     (clobber (reg:CC CC_REGNUM))]
   "TARGET_SIMD"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(set (match_operand:DI 0 "register_operand")
 	(neg:DI
 	  (UCOMPARISONS:DI
@@ -4477,7 +4681,7 @@
     (clobber (reg:CC CC_REGNUM))]
   "TARGET_SIMD"
   "#"
-  "reload_completed"
+  "&& reload_completed"
   [(set (match_operand:DI 0 "register_operand")
 	(neg:DI
 	  (ne:DI
@@ -4624,7 +4828,7 @@
 		   UNSPEC_LD2_LANE))]
   "TARGET_SIMD"
   {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1";
   }
   [(set_attr "type" "neon_load2_one_lane")]
@@ -4640,7 +4844,7 @@
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (OImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_simd_ld2<mode> (tmp, operands[1]));
       emit_insn (gen_aarch64_rev_reglistoi (operands[0], tmp, mask));
     }
@@ -4668,7 +4872,7 @@
 		   UNSPEC_ST2_LANE))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "st2\\t{%S1.<Vetype> - %T1.<Vetype>}[%2], %0";
   }
   [(set_attr "type" "neon_store2_one_lane<q>")]
@@ -4684,7 +4888,7 @@
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (OImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_rev_reglistoi (tmp, operands[1], mask));
       emit_insn (gen_aarch64_simd_st2<mode> (operands[0], tmp));
     }
@@ -4722,7 +4926,7 @@
 		   UNSPEC_LD3_LANE))]
   "TARGET_SIMD"
 {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1";
 }
   [(set_attr "type" "neon_load3_one_lane")]
@@ -4738,7 +4942,7 @@
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (CImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_simd_ld3<mode> (tmp, operands[1]));
       emit_insn (gen_aarch64_rev_reglistci (operands[0], tmp, mask));
     }
@@ -4766,7 +4970,7 @@
 		    UNSPEC_ST3_LANE))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "st3\\t{%S1.<Vetype> - %U1.<Vetype>}[%2], %0";
   }
   [(set_attr "type" "neon_store3_one_lane<q>")]
@@ -4782,7 +4986,7 @@
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (CImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_rev_reglistci (tmp, operands[1], mask));
       emit_insn (gen_aarch64_simd_st3<mode> (operands[0], tmp));
     }
@@ -4820,7 +5024,7 @@
 		   UNSPEC_LD4_LANE))]
   "TARGET_SIMD"
 {
-    operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+    operands[3] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[3]));
     return "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1";
 }
   [(set_attr "type" "neon_load4_one_lane")]
@@ -4836,7 +5040,7 @@
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (XImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_simd_ld4<mode> (tmp, operands[1]));
       emit_insn (gen_aarch64_rev_reglistxi (operands[0], tmp, mask));
     }
@@ -4864,7 +5068,7 @@
 		    UNSPEC_ST4_LANE))]
   "TARGET_SIMD"
   {
-    operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+    operands[2] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[2]));
     return "st4\\t{%S1.<Vetype> - %V1.<Vetype>}[%2], %0";
   }
   [(set_attr "type" "neon_store4_one_lane<q>")]
@@ -4880,7 +5084,7 @@
   if (BYTES_BIG_ENDIAN)
     {
       rtx tmp = gen_reg_rtx (XImode);
-      rtx mask = aarch64_reverse_mask (<MODE>mode);
+      rtx mask = aarch64_reverse_mask (<MODE>mode, <nunits>);
       emit_insn (gen_aarch64_rev_reglistxi (tmp, operands[1], mask));
       emit_insn (gen_aarch64_simd_st4<mode> (operands[0], tmp));
     }
@@ -4928,6 +5132,70 @@
     }
 })
 
+
+(define_expand "aarch64_ld1x3<VALLDIF:mode>"
+  [(match_operand:CI 0 "register_operand" "=w")
+   (match_operand:DI 1 "register_operand" "r")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[1]);
+  emit_insn (gen_aarch64_ld1_x3_<VALLDIF:mode> (operands[0], mem));
+  DONE;
+})
+
+(define_insn "aarch64_ld1_x3_<mode>"
+  [(set (match_operand:CI 0 "register_operand" "=w")
+        (unspec:CI
+	  [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
+	   (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %U0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_3reg<q>")]
+)
+
+(define_expand "aarch64_st1x2<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:OI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (OImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x2_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x2_<mode>"
+   [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
+	 (unspec:OI
+	  [(match_operand:OI 1 "register_operand" "w")
+          (unspec:VALLDIF [(const_int 2)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %T1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_2reg<q>")]
+)
+
+(define_expand "aarch64_st1x3<VALLDIF:mode>"
+  [(match_operand:DI 0 "register_operand" "")
+   (match_operand:CI 1 "register_operand" "")
+   (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  rtx mem = gen_rtx_MEM (CImode, operands[0]);
+  emit_insn (gen_aarch64_st1_x3_<VALLDIF:mode> (mem, operands[1]));
+  DONE;
+})
+
+(define_insn "aarch64_st1_x3_<mode>"
+   [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
+	(unspec:CI
+         [(match_operand:CI 1 "register_operand" "w")
+	  (unspec:VALLDIF [(const_int 3)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST1))]
+  "TARGET_SIMD"
+  "st1\\t{%S1.<Vtype> - %U1.<Vtype>}, %0"
+  [(set_attr "type" "neon_store1_3reg<q>")]
+)
+
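
The ld1x3/st1x2/st1x3 expanders above only wrap the incoming pointer in a MEM of the
struct mode (CImode or OImode) and forward it to the matching single-instruction
ld1/st1 pattern.  As a rough, hedged illustration of the intent (the arm_neon.h
intrinsic names below are an assumption about the mapping, not taken from this patch):

    #include <arm_neon.h>

    /* One "ld1 {vA.16b - vC.16b}, [x0]" covering three consecutive vectors.  */
    uint8x16x3_t
    load_x3 (const uint8_t *p)
    {
      return vld1q_u8_x3 (p);
    }

    /* One "st1 {vA.16b - vB.16b}, [x0]" covering two consecutive vectors.  */
    void
    store_x2 (uint8_t *p, uint8x16x2_t v)
    {
      vst1q_u8_x2 (p, v);
    }
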
 (define_insn "*aarch64_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w")
 	(match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))]
@@ -5176,6 +5444,33 @@
   DONE;
 })
 
+(define_expand "aarch64_ld1x2<VQ:mode>"
+ [(match_operand:OI 0 "register_operand" "=w")
+  (match_operand:DI 1 "register_operand" "r")
+  (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  machine_mode mode = OImode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+  emit_insn (gen_aarch64_simd_ld1<VQ:mode>_x2 (operands[0], mem));
+  DONE;
+})
+
+(define_expand "aarch64_ld1x2<VDC:mode>"
+ [(match_operand:OI 0 "register_operand" "=w")
+  (match_operand:DI 1 "register_operand" "r")
+  (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+  "TARGET_SIMD"
+{
+  machine_mode mode = OImode;
+  rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+  emit_insn (gen_aarch64_simd_ld1<VDC:mode>_x2 (operands[0], mem));
+  DONE;
+})
+
+
 (define_expand "aarch64_ld<VSTRUCT:nregs>_lane<VALLDIF:mode>"
   [(match_operand:VSTRUCT 0 "register_operand" "=w")
 	(match_operand:DI 1 "register_operand" "w")
@@ -5188,9 +5483,7 @@
   set_mem_size (mem, GET_MODE_SIZE (GET_MODE_INNER (<VALLDIF:MODE>mode))
 		     * <VSTRUCT:nregs>);
 
-  aarch64_simd_lane_bounds (operands[3], 0,
-			    GET_MODE_NUNITS (<VALLDIF:MODE>mode),
-			    NULL);
+  aarch64_simd_lane_bounds (operands[3], 0, <VALLDIF:nunits>, NULL);
   emit_insn (gen_aarch64_vec_load_lanes<VSTRUCT:mode>_lane<VALLDIF:mode> (
 	operands[0], mem, operands[2], operands[3]));
   DONE;
@@ -5238,20 +5531,6 @@
 
 ;; vec_perm support
 
-(define_expand "vec_perm_const<mode>"
-  [(match_operand:VALL_F16 0 "register_operand")
-   (match_operand:VALL_F16 1 "register_operand")
-   (match_operand:VALL_F16 2 "register_operand")
-   (match_operand:<V_INT_EQUIV> 3)]
-  "TARGET_SIMD"
-{
-  if (aarch64_expand_vec_perm_const (operands[0], operands[1],
-				     operands[2], operands[3]))
-    DONE;
-  else
-    FAIL;
-})
-
 (define_expand "vec_perm<mode>"
   [(match_operand:VB 0 "register_operand")
    (match_operand:VB 1 "register_operand")
@@ -5260,7 +5539,7 @@
   "TARGET_SIMD"
 {
   aarch64_expand_vec_perm (operands[0], operands[1],
-			   operands[2], operands[3]);
+			   operands[2], operands[3], <nunits>);
   DONE;
 })
 
@@ -5369,6 +5648,9 @@
 [(set_attr "type" "multiple")]
 )
 
+;; This instruction's pattern is generated directly by
+;; aarch64_expand_vec_perm_const, so any changes to the pattern would
+;; need corresponding changes there.
 (define_insn "aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
@@ -5379,7 +5661,10 @@
   [(set_attr "type" "neon_permute<q>")]
 )
 
-;; Note immediate (third) operand is lane index not byte index.
+;; This instruction's pattern is generated directly by
+;; aarch64_expand_vec_perm_const, so any changes to the pattern would
+;; need corresponding changes there.  Note that the immediate (third)
+;; operand is a lane index not a byte index.
 (define_insn "aarch64_ext<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
         (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
@@ -5395,6 +5680,9 @@
   [(set_attr "type" "neon_ext<q>")]
 )
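
For aarch64_ext the note above matters because the architectural EXT instruction takes a
byte-granular immediate, while this pattern's operand 3 counts elements; the output code
is expected to scale one into the other.  A hedged worked example of that scaling:

    /* Illustrative only: element (lane) index -> EXT byte immediate.  */
    unsigned int
    ext_byte_imm (unsigned int lane, unsigned int elem_size)
    {
      /* e.g. lane 1 of a .4s vector: 1 * 4 bytes, giving
         "ext v0.16b, v1.16b, v2.16b, #4".  */
      return lane * elem_size;
    }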
 
+;; This instruction's pattern is generated directly by
+;; aarch64_expand_vec_perm_const, so any changes to the pattern would
+;; need corresponding changes there.
 (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")]
@@ -5563,25 +5851,47 @@
   [(set_attr "type" "neon_load1_all_lanes")]
 )
 
-(define_insn "aarch64_frecpe<mode>"
-  [(set (match_operand:VHSDF 0 "register_operand" "=w")
-	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+(define_insn "aarch64_simd_ld1<mode>_x2"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+	(unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")
+		    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		   UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_2reg<q>")]
+)
+
+(define_insn "aarch64_simd_ld1<mode>_x2"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+	(unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv")
+		    (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+		   UNSPEC_LD1))]
+  "TARGET_SIMD"
+  "ld1\\t{%S0.<Vtype> - %T0.<Vtype>}, %1"
+  [(set_attr "type" "neon_load1_2reg<q>")]
+)
+
+
+(define_insn "@aarch64_frecpe<mode>"
+  [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
+	(unspec:VHSDF_HSDF
+	 [(match_operand:VHSDF_HSDF 1 "register_operand" "w")]
 	 UNSPEC_FRECPE))]
   "TARGET_SIMD"
-  "frecpe\\t%0.<Vtype>, %1.<Vtype>"
+  "frecpe\t%<v>0<Vmtype>, %<v>1<Vmtype>"
   [(set_attr "type" "neon_fp_recpe_<stype><q>")]
 )
 
-(define_insn "aarch64_frecp<FRECP:frecp_suffix><mode>"
+(define_insn "aarch64_frecpx<mode>"
   [(set (match_operand:GPF_F16 0 "register_operand" "=w")
 	(unspec:GPF_F16 [(match_operand:GPF_F16 1 "register_operand" "w")]
-	 FRECP))]
-  "TARGET_SIMD"
-  "frecp<FRECP:frecp_suffix>\\t%<s>0, %<s>1"
-  [(set_attr "type" "neon_fp_recp<FRECP:frecp_suffix>_<GPF_F16:stype>")]
-)
-
-(define_insn "aarch64_frecps<mode>"
+	 UNSPEC_FRECPX))]
+  "TARGET_SIMD"
+  "frecpx\t%<s>0, %<s>1"
+  [(set_attr "type" "neon_fp_recpx_<GPF_F16:stype>")]
+)
+
+(define_insn "@aarch64_frecps<mode>"
   [(set (match_operand:VHSDF_HSDF 0 "register_operand" "=w")
 	(unspec:VHSDF_HSDF
 	  [(match_operand:VHSDF_HSDF 1 "register_operand" "w")
@@ -5617,10 +5927,33 @@
 
 (define_insn "aarch64_crypto_aes<aes_op>v16qi"
   [(set (match_operand:V16QI 0 "register_operand" "=w")
-        (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0")
+	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%0")
 		       (match_operand:V16QI 2 "register_operand" "w")]
          CRYPTO_AES))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_AES"
+  "aes<aes_op>\\t%0.16b, %2.16b"
+  [(set_attr "type" "crypto_aese")]
+)
+
+(define_insn "*aarch64_crypto_aes<aes_op>v16qi_xor_combine"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+	(unspec:V16QI [(xor:V16QI
+			(match_operand:V16QI 1 "register_operand" "%0")
+			(match_operand:V16QI 2 "register_operand" "w"))
+		       (match_operand:V16QI 3 "aarch64_simd_imm_zero" "")]
+		       CRYPTO_AES))]
+  "TARGET_SIMD && TARGET_AES"
+  "aes<aes_op>\\t%0.16b, %2.16b"
+  [(set_attr "type" "crypto_aese")]
+)
+
+(define_insn "*aarch64_crypto_aes<aes_op>v16qi_xor_combine"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+	(unspec:V16QI [(match_operand:V16QI 3 "aarch64_simd_imm_zero" "")
+	(xor:V16QI (match_operand:V16QI 1 "register_operand" "%0")
+		   (match_operand:V16QI 2 "register_operand" "w"))]
+	CRYPTO_AES))]
+  "TARGET_SIMD && TARGET_AES"
   "aes<aes_op>\\t%0.16b, %2.16b"
   [(set_attr "type" "crypto_aese")]
 )
@@ -5635,7 +5968,7 @@
   [(set (match_operand:V16QI 0 "register_operand" "=w,w")
 	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")]
 	 CRYPTO_AESMC))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_AES"
   "aes<aesmc_op>\\t%0.16b, %1.16b"
   [(set_attr "type" "crypto_aesmc")
    (set_attr_alternative "enabled"
@@ -5646,6 +5979,44 @@
       (const_string "yes")])]
 )
 
+;; When AESE/AESMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
+
+(define_insn "*aarch64_crypto_aese_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=&w")
+	(unspec:V16QI
+	  [(unspec:V16QI
+	    [(match_operand:V16QI 1 "register_operand" "0")
+	     (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESE)
+	  ] UNSPEC_AESMC))]
+  "TARGET_SIMD && TARGET_AES
+   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
+  "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
+
+;; When AESD/AESIMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
+
+(define_insn "*aarch64_crypto_aesd_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=&w")
+	(unspec:V16QI
+	  [(unspec:V16QI
+	    [(match_operand:V16QI 1 "register_operand" "0")
+	     (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESD)
+	  ] UNSPEC_AESIMC))]
+  "TARGET_SIMD && TARGET_AES
+   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
+  "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
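
Both fused patterns above exist so that the usual intrinsic-level pairs stay adjacent once
combine has merged them; a minimal sketch of the C they are aimed at, using the standard
arm_neon.h intrinsics (the mapping onto these exact patterns is an assumption):

    #include <arm_neon.h>

    /* AESE+AESMC and AESD+AESIMC pairs.  When the tuning enables
       AARCH64_FUSE_AES_AESMC, each pair is kept together as one
       8-byte two-instruction sequence by the patterns above.  */
    uint8x16_t
    encrypt_round (uint8x16_t state, uint8x16_t round_key)
    {
      return vaesmcq_u8 (vaeseq_u8 (state, round_key));   /* aese ; aesmc */
    }

    uint8x16_t
    decrypt_round (uint8x16_t state, uint8x16_t round_key)
    {
      return vaesimcq_u8 (vaesdq_u8 (state, round_key));  /* aesd ; aesimc */
    }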
+
 ;; sha1
 
 (define_insn "aarch64_crypto_sha1hsi"
@@ -5653,7 +6024,7 @@
         (unspec:SI [(match_operand:SI 1
                        "register_operand" "w")]
          UNSPEC_SHA1H))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1h\\t%s0, %s1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
@@ -5663,7 +6034,7 @@
 	(unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
 		     (parallel [(const_int 0)]))]
 	 UNSPEC_SHA1H))]
-  "TARGET_SIMD && TARGET_CRYPTO && !BYTES_BIG_ENDIAN"
+  "TARGET_SIMD && TARGET_SHA2 && !BYTES_BIG_ENDIAN"
   "sha1h\\t%s0, %s1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
@@ -5673,7 +6044,7 @@
 	(unspec:SI [(vec_select:SI (match_operand:V4SI 1 "register_operand" "w")
 		     (parallel [(const_int 3)]))]
 	 UNSPEC_SHA1H))]
-  "TARGET_SIMD && TARGET_CRYPTO && BYTES_BIG_ENDIAN"
+  "TARGET_SIMD && TARGET_SHA2 && BYTES_BIG_ENDIAN"
   "sha1h\\t%s0, %s1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
@@ -5683,7 +6054,7 @@
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
                       (match_operand:V4SI 2 "register_operand" "w")]
          UNSPEC_SHA1SU1))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1su1\\t%0.4s, %2.4s"
   [(set_attr "type" "crypto_sha1_fast")]
 )
@@ -5694,7 +6065,7 @@
                       (match_operand:SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          CRYPTO_SHA1))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1<sha1_op>\\t%q0, %s2, %3.4s"
   [(set_attr "type" "crypto_sha1_slow")]
 )
@@ -5705,7 +6076,7 @@
                       (match_operand:V4SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          UNSPEC_SHA1SU0))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha1su0\\t%0.4s, %2.4s, %3.4s"
   [(set_attr "type" "crypto_sha1_xor")]
 )
@@ -5718,7 +6089,7 @@
                       (match_operand:V4SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          CRYPTO_SHA256))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha256h<sha256_op>\\t%q0, %q2, %3.4s"
   [(set_attr "type" "crypto_sha256_slow")]
 )
@@ -5728,7 +6099,7 @@
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
                       (match_operand:V4SI 2 "register_operand" "w")]
          UNSPEC_SHA256SU0))]
-  "TARGET_SIMD &&TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha256su0\\t%0.4s, %2.4s"
   [(set_attr "type" "crypto_sha256_fast")]
 )
@@ -5739,11 +6110,728 @@
                       (match_operand:V4SI 2 "register_operand" "w")
                       (match_operand:V4SI 3 "register_operand" "w")]
          UNSPEC_SHA256SU1))]
-  "TARGET_SIMD &&TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_SHA2"
   "sha256su1\\t%0.4s, %2.4s, %3.4s"
   [(set_attr "type" "crypto_sha256_slow")]
 )
 
+;; sha512
+
+(define_insn "aarch64_crypto_sha512h<sha512_op>qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand:V2DI 2 "register_operand" "w")
+                      (match_operand:V2DI 3 "register_operand" "w")]
+         CRYPTO_SHA512))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "sha512h<sha512_op>\\t%q0, %q2, %3.2d"
+  [(set_attr "type" "crypto_sha512")]
+)
+
+(define_insn "aarch64_crypto_sha512su0qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand:V2DI 2 "register_operand" "w")]
+         UNSPEC_SHA512SU0))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "sha512su0\\t%0.2d, %2.2d"
+  [(set_attr "type" "crypto_sha512")]
+)
+
+(define_insn "aarch64_crypto_sha512su1qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand:V2DI 2 "register_operand" "w")
+                      (match_operand:V2DI 3 "register_operand" "w")]
+         UNSPEC_SHA512SU1))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "sha512su1\\t%0.2d, %2.2d, %3.2d"
+  [(set_attr "type" "crypto_sha512")]
+)
+
+;; sha3
+
+(define_insn "eor3q<mode>4"
+  [(set (match_operand:VQ_I 0 "register_operand" "=w")
+	(xor:VQ_I
+	 (xor:VQ_I
+	  (match_operand:VQ_I 2 "register_operand" "w")
+	  (match_operand:VQ_I 3 "register_operand" "w"))
+	 (match_operand:VQ_I 1 "register_operand" "w")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "eor3\\t%0.16b, %1.16b, %2.16b, %3.16b"
+  [(set_attr "type" "crypto_sha3")]
+)
+
+(define_insn "aarch64_rax1qv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+	(xor:V2DI
+	 (rotate:V2DI
+	  (match_operand:V2DI 2 "register_operand" "w")
+	  (const_int 1))
+	 (match_operand:V2DI 1 "register_operand" "w")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "rax1\\t%0.2d, %1.2d, %2.2d"
+  [(set_attr "type" "crypto_sha3")]
+)
+
+(define_insn "aarch64_xarqv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=w")
+	(rotatert:V2DI
+	 (xor:V2DI
+	  (match_operand:V2DI 1 "register_operand" "%w")
+	  (match_operand:V2DI 2 "register_operand" "w"))
+	 (match_operand:SI 3 "aarch64_simd_shift_imm_di" "Usd")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "xar\\t%0.2d, %1.2d, %2.2d, %3"
+  [(set_attr "type" "crypto_sha3")]
+)
+
+(define_insn "bcaxq<mode>4"
+  [(set (match_operand:VQ_I 0 "register_operand" "=w")
+	(xor:VQ_I
+	 (and:VQ_I
+	  (not:VQ_I (match_operand:VQ_I 3 "register_operand" "w"))
+	  (match_operand:VQ_I 2 "register_operand" "w"))
+	 (match_operand:VQ_I 1 "register_operand" "w")))]
+  "TARGET_SIMD && TARGET_SHA3"
+  "bcax\\t%0.16b, %1.16b, %2.16b, %3.16b"
+  [(set_attr "type" "crypto_sha3")]
+)
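
The RTL above fixes the bitwise semantics of the new SHA-3 helper instructions; restated
as plain C over 64-bit lanes (a sketch of what the patterns encode, not code from the patch):

    #include <stdint.h>

    /* EOR3: three-way exclusive OR.  */
    static inline uint64_t eor3 (uint64_t a, uint64_t b, uint64_t c)
    { return a ^ b ^ c; }

    /* RAX1: rotate the second source left by one, then exclusive OR.  */
    static inline uint64_t rax1 (uint64_t a, uint64_t b)
    { return a ^ ((b << 1) | (b >> 63)); }

    /* XAR: exclusive OR, then rotate right by an immediate.  */
    static inline uint64_t xar (uint64_t a, uint64_t b, unsigned imm)
    {
      uint64_t t = a ^ b;
      imm &= 63;                /* keep the shift amounts well defined */
      return imm ? (t >> imm) | (t << (64 - imm)) : t;
    }

    /* BCAX: bit clear (b AND NOT c), then exclusive OR with a.  */
    static inline uint64_t bcax (uint64_t a, uint64_t b, uint64_t c)
    { return a ^ (b & ~c); }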
+
+;; SM3
+
+(define_insn "aarch64_sm3ss1qv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "w")
+		      (match_operand:V4SI 2 "register_operand" "w")
+		      (match_operand:V4SI 3 "register_operand" "w")]
+	 UNSPEC_SM3SS1))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm3ss1\\t%0.4s, %1.4s, %2.4s, %3.4s"
+  [(set_attr "type" "crypto_sm3")]
+)
+
+
+(define_insn "aarch64_sm3tt<sm3tt_op>qv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
+		      (match_operand:V4SI 2 "register_operand" "w")
+		      (match_operand:V4SI 3 "register_operand" "w")
+		      (match_operand:SI 4 "aarch64_imm2" "Ui2")]
+	 CRYPTO_SM3TT))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm3tt<sm3tt_op>\\t%0.4s, %2.4s, %3.4s[%4]"
+  [(set_attr "type" "crypto_sm3")]
+)
+
+(define_insn "aarch64_sm3partw<sm3part_op>qv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
+		      (match_operand:V4SI 2 "register_operand" "w")
+		      (match_operand:V4SI 3 "register_operand" "w")]
+	 CRYPTO_SM3PART))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm3partw<sm3part_op>\\t%0.4s, %2.4s, %3.4s"
+  [(set_attr "type" "crypto_sm3")]
+)
+
+;; SM4
+
+(define_insn "aarch64_sm4eqv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "0")
+		      (match_operand:V4SI 2 "register_operand" "w")]
+	 UNSPEC_SM4E))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm4e\\t%0.4s, %2.4s"
+  [(set_attr "type" "crypto_sm4")]
+)
+
+(define_insn "aarch64_sm4ekeyqv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=w")
+	(unspec:V4SI [(match_operand:V4SI 1 "register_operand" "w")
+		      (match_operand:V4SI 2 "register_operand" "w")]
+	 UNSPEC_SM4EKEY))]
+  "TARGET_SIMD && TARGET_SM4"
+  "sm4ekey\\t%0.4s, %1.4s, %2.4s"
+  [(set_attr "type" "crypto_sm4")]
+)
+
+;; fp16fml
+
+(define_expand "aarch64_fml<f16mac1>l<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "0")
+	  (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	  (match_operand:<VFMLA_W> 3 "register_operand" "w")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+  rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode,
+					    <nunits> * 2, false);
+  rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode,
+					    <nunits> * 2, false);
+
+  emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_low<mode> (operands[0],
+								operands[1],
+								operands[2],
+								operands[3],
+								p1, p2));
+  DONE;
+
+})
+
+(define_expand "aarch64_fml<f16mac1>l<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(unspec:VDQSF
+	 [(match_operand:VDQSF 1 "register_operand" "0")
+	  (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	  (match_operand:<VFMLA_W> 3 "register_operand" "w")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+  rtx p1 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, <nunits> * 2, true);
+  rtx p2 = aarch64_simd_vect_par_cnst_half (<VFMLA_W>mode, <nunits> * 2, true);
+
+  emit_insn (gen_aarch64_simd_fml<f16mac1>l<f16quad>_high<mode> (operands[0],
+								 operands[1],
+								 operands[2],
+								 operands[3],
+								 p1, p2));
+  DONE;
+})
+
+(define_insn "aarch64_simd_fmlal<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl<f16quad>_low<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (neg:<VFMLA_SEL_W>
+	   (vec_select:<VFMLA_SEL_W>
+	    (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	    (match_operand:<VFMLA_W> 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_lo_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl<f16quad>_high<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+	(fma:VDQSF
+	 (float_extend:VDQSF
+	  (neg:<VFMLA_SEL_W>
+	   (vec_select:<VFMLA_SEL_W>
+	    (match_operand:<VFMLA_W> 2 "register_operand" "w")
+	    (match_operand:<VFMLA_W> 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:VDQSF
+	  (vec_select:<VFMLA_SEL_W>
+	   (match_operand:<VFMLA_W> 3 "register_operand" "w")
+	   (match_operand:<VFMLA_W> 5 "vect_par_cnst_hi_half" "")))
+	 (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.<nunits>s, %2.<nunits>h, %3.<nunits>h"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
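The four insns just above spell out what the base FMLAL/FMLSL forms compute: select the
low or high half of each half-precision source, widen to single precision, and do a fused
multiply-accumulate into the float32 destination, with the FMLSL variants negating the
first source.  A scalar model of the low-half FMLAL case, as encoded by the RTL
(illustrative only; assumes _Float16 and fmaf are available):

    #include <math.h>

    /* Per-lane model of "fmlal  vd.4s, vn.4h, vm.4h": the low four
       half-precision lanes of vn and vm are widened and fused into vd.
       The fmlal2/"high" insns use lanes 4..7; fmlsl negates the vn term.  */
    void
    fmlal_low (float d[4], const _Float16 n[8], const _Float16 m[8])
    {
      for (int i = 0; i < 4; i++)
        d[i] = fmaf ((float) n[i], (float) m[i], d[i]);
    }
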
+(define_expand "aarch64_fml<f16mac1>l_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+			   (match_operand:V4HF 2 "register_operand" "")
+			   (match_operand:V4HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, false);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_lowv2sf (operands[0],
+							    operands[1],
+							    operands[2],
+							    operands[3],
+							    p1, lane));
+    DONE;
+}
+)
+
+(define_expand "aarch64_fml<f16mac1>l_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+			   (match_operand:V4HF 2 "register_operand" "")
+			   (match_operand:V4HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, true);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_lane_highv2sf (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlal_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V2SF
+	   (vec_duplicate:V2HF
+	    (vec_select:HF
+	     (match_operand:V4HF 3 "register_operand" "x")
+	     (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_lane_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	  (neg:V2HF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V2SF
+	   (vec_duplicate:V2HF
+	    (vec_select:HF
+	     (match_operand:V4HF 3 "register_operand" "x")
+	     (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_lane_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (neg:V2HF
+	    (vec_select:V2HF
+	     (match_operand:V4HF 2 "register_operand" "w")
+	     (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V2SF
+	   (vec_duplicate:V2HF
+	    (vec_select:HF
+	     (match_operand:V4HF 3 "register_operand" "x")
+	     (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>lq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+			   (match_operand:V8HF 2 "register_operand" "")
+			   (match_operand:V8HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, false);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_lowv4sf (operands[0],
+							      operands[1],
+							      operands[2],
+							      operands[3],
+							      p1, lane));
+    DONE;
+})
+
+(define_expand "aarch64_fml<f16mac1>lq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+			   (match_operand:V8HF 2 "register_operand" "")
+			   (match_operand:V8HF 3 "register_operand" "")
+			   (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, true);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_laneq_highv4sf (operands[0],
+							       operands[1],
+							       operands[2],
+							       operands[3],
+							       p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlalq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_laneq_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	  (float_extend:V4SF
+	   (neg:V4HF
+	    (vec_select:V4HF
+	     (match_operand:V8HF 2 "register_operand" "w")
+	     (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlalq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_laneq_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (neg:V4HF
+	   (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>l_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+		      (match_operand:V4HF 2 "register_operand" "")
+		      (match_operand:V8HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, false);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_lowv2sf (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     p1, lane));
+    DONE;
+
+})
+
+(define_expand "aarch64_fml<f16mac1>l_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "")
+		      (match_operand:V4HF 2 "register_operand" "")
+		      (match_operand:V8HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_lane_imm3" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V4HFmode, 4, true);
+    rtx lane = aarch64_endian_lane_rtx (V8HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>l_laneq_highv2sf (operands[0],
+							      operands[1],
+							      operands[2],
+							      operands[3],
+							      p1, lane));
+    DONE;
+
+})
+
+(define_insn "aarch64_simd_fmlal_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_laneq_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	  (neg:V2HF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlal_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlsl_laneq_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=w")
+	(fma:V2SF
+	 (float_extend:V2SF
+	  (neg:V2HF
+	   (vec_select:V2HF
+	    (match_operand:V4HF 2 "register_operand" "w")
+	    (match_operand:V4HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V2SF
+	  (vec_duplicate:V2HF
+	   (vec_select:HF
+	    (match_operand:V8HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_lane_imm3" "Ui7")]))))
+	 (match_operand:V2SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.2s, %2.2h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_expand "aarch64_fml<f16mac1>lq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+		      (match_operand:V8HF 2 "register_operand" "")
+		      (match_operand:V4HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_LOW))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, false);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_lowv4sf (operands[0],
+							     operands[1],
+							     operands[2],
+							     operands[3],
+							     p1, lane));
+    DONE;
+})
+
+(define_expand "aarch64_fml<f16mac1>lq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "")
+		      (match_operand:V8HF 2 "register_operand" "")
+		      (match_operand:V4HF 3 "register_operand" "")
+		      (match_operand:SI 4 "aarch64_imm2" "")]
+	 VFMLA16_HIGH))]
+  "TARGET_F16FML"
+{
+    rtx p1 = aarch64_simd_vect_par_cnst_half (V8HFmode, 8, true);
+    rtx lane = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4]));
+
+    emit_insn (gen_aarch64_simd_fml<f16mac1>lq_lane_highv4sf (operands[0],
+							      operands[1],
+							      operands[2],
+							      operands[3],
+							      p1, lane));
+    DONE;
+})
+
+(define_insn "aarch64_simd_fmlalq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	   (match_operand:V8HF 2 "register_operand" "w")
+	   (match_operand:V8HF 4 "vect_par_cnst_lo_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_lane_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (neg:V4HF
+	   (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_lo_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlalq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (vec_select:V4HF
+	   (match_operand:V8HF 2 "register_operand" "w")
+	   (match_operand:V8HF 4 "vect_par_cnst_hi_half" "")))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlal2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
+(define_insn "aarch64_simd_fmlslq_lane_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(fma:V4SF
+	 (float_extend:V4SF
+	  (neg:V4HF
+	   (vec_select:V4HF
+	    (match_operand:V8HF 2 "register_operand" "w")
+	    (match_operand:V8HF 4 "vect_par_cnst_hi_half" ""))))
+	 (float_extend:V4SF
+	  (vec_duplicate:V4HF
+	   (vec_select:HF
+	    (match_operand:V4HF 3 "register_operand" "x")
+	    (parallel [(match_operand:SI 5 "aarch64_imm2" "Ui2")]))))
+	 (match_operand:V4SF 1 "register_operand" "0")))]
+  "TARGET_F16FML"
+  "fmlsl2\\t%0.4s, %2.4h, %3.h[%5]"
+  [(set_attr "type" "neon_fp_mul_s")]
+)
+
 ;; pmull
 
 (define_insn "aarch64_crypto_pmulldi"
@@ -5751,7 +6839,7 @@
         (unspec:TI  [(match_operand:DI 1 "register_operand" "w")
 		     (match_operand:DI 2 "register_operand" "w")]
 		    UNSPEC_PMULL))]
- "TARGET_SIMD && TARGET_CRYPTO"
+ "TARGET_SIMD && TARGET_AES"
  "pmull\\t%0.1q, %1.1d, %2.1d"
   [(set_attr "type" "crypto_pmull")]
 )
@@ -5761,7 +6849,7 @@
        (unspec:TI [(match_operand:V2DI 1 "register_operand" "w")
 		   (match_operand:V2DI 2 "register_operand" "w")]
 		  UNSPEC_PMULL2))]
-  "TARGET_SIMD && TARGET_CRYPTO"
+  "TARGET_SIMD && TARGET_AES"
   "pmull2\\t%0.1q, %1.2d, %2.2d"
   [(set_attr "type" "crypto_pmull")]
 )