diff gcc/config/aarch64/aarch64.md @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
line wrap: on
line diff
--- a/gcc/config/aarch64/aarch64.md	Fri Oct 27 22:46:09 2017 +0900
+++ b/gcc/config/aarch64/aarch64.md	Thu Oct 25 07:37:49 2018 +0900
@@ -1,5 +1,5 @@
 ;; Machine description for AArch64 architecture.
-;; Copyright (C) 2009-2017 Free Software Foundation, Inc.
+;; Copyright (C) 2009-2018 Free Software Foundation, Inc.
 ;; Contributed by ARM Ltd.
 ;;
 ;; This file is part of GCC.
@@ -57,12 +57,63 @@
     (LR_REGNUM		30)
     (SP_REGNUM		31)
     (V0_REGNUM		32)
+    (V1_REGNUM		33)
+    (V2_REGNUM		34)
+    (V3_REGNUM		35)
+    (V4_REGNUM		36)
+    (V5_REGNUM		37)
+    (V6_REGNUM		38)
+    (V7_REGNUM		39)
+    (V8_REGNUM		40)
+    (V9_REGNUM		41)
+    (V10_REGNUM		42)
+    (V11_REGNUM		43)
+    (V12_REGNUM		44)
+    (V13_REGNUM		45)
+    (V14_REGNUM		46)
     (V15_REGNUM		47)
+    (V16_REGNUM		48)
+    (V17_REGNUM		49)
+    (V18_REGNUM		50)
+    (V19_REGNUM		51)
+    (V20_REGNUM		52)
+    (V21_REGNUM		53)
+    (V22_REGNUM		54)
+    (V23_REGNUM		55)
+    (V24_REGNUM		56)
+    (V25_REGNUM		57)
+    (V26_REGNUM		58)
+    (V27_REGNUM		59)
+    (V28_REGNUM		60)
+    (V29_REGNUM		61)
+    (V30_REGNUM		62)
     (V31_REGNUM		63)
     (LAST_SAVED_REGNUM	63)
     (SFP_REGNUM		64)
     (AP_REGNUM		65)
     (CC_REGNUM		66)
+    ;; Defined only to make the DWARF description simpler.
+    (VG_REGNUM		67)
+    (P0_REGNUM		68)
+    (P1_REGNUM		69)
+    (P2_REGNUM		70)
+    (P3_REGNUM		71)
+    (P4_REGNUM		72)
+    (P5_REGNUM		73)
+    (P6_REGNUM		74)
+    (P7_REGNUM		75)
+    (P8_REGNUM		76)
+    (P9_REGNUM		77)
+    (P10_REGNUM		78)
+    (P11_REGNUM		79)
+    (P12_REGNUM		80)
+    (P13_REGNUM		81)
+    (P14_REGNUM		82)
+    (P15_REGNUM		83)
+    ;; A couple of call-clobbered registers that we need to reserve when
+    ;; tracking speculation this is not ABI, so is subject to change.
+    (SPECULATION_TRACKER_REGNUM 15)
+    (SPECULATION_SCRATCH_REGNUM 14)
   ]
 )
 
@@ -114,7 +165,11 @@
     UNSPEC_PACI1716
     UNSPEC_PACISP
     UNSPEC_PRLG_STK
+    UNSPEC_REV
     UNSPEC_RBIT
+    UNSPEC_SABAL
+    UNSPEC_SABDL2
+    UNSPEC_SADALP
     UNSPEC_SCVTF
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
@@ -133,6 +188,9 @@
     UNSPEC_TLSLE24
     UNSPEC_TLSLE32
     UNSPEC_TLSLE48
+    UNSPEC_UABAL
+    UNSPEC_UABDL2
+    UNSPEC_UADALP
     UNSPEC_UCVTF
     UNSPEC_USHL_2S
     UNSPEC_VSTRUCTDUMMY
@@ -143,6 +201,27 @@
     UNSPEC_RSQRTS
     UNSPEC_NZCV
     UNSPEC_XPACLRI
+    UNSPEC_LD1_SVE
+    UNSPEC_ST1_SVE
+    UNSPEC_LD1RQ
+    UNSPEC_LD1_GATHER
+    UNSPEC_ST1_SCATTER
+    UNSPEC_MERGE_PTRUE
+    UNSPEC_PTEST_PTRUE
+    UNSPEC_UNPACKSHI
+    UNSPEC_UNPACKUHI
+    UNSPEC_UNPACKSLO
+    UNSPEC_UNPACKULO
+    UNSPEC_PACK
+    UNSPEC_FLOAT_CONVERT
+    UNSPEC_WHILE_LO
+    UNSPEC_LDN
+    UNSPEC_STN
+    UNSPEC_INSR
+    UNSPEC_CLASTB
+    UNSPEC_FADDA
+    UNSPEC_REV_SUBREG
+    UNSPEC_SPECULATION_TRACKER
 ])
 
 (define_c_enum "unspecv" [
@@ -153,6 +232,7 @@
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
     UNSPECV_BLOCKAGE		; Represent a blockage
     UNSPECV_PROBE_STACK_RANGE	; Represent stack range probing.
+    UNSPECV_SPECULATION_BARRIER ; Represent speculation barrier.
   ]
 )
 
@@ -179,38 +259,54 @@
 ;; FP or SIMD registers then the pattern predicate should include TARGET_FLOAT
 ;; or TARGET_SIMD.
 
+;; Attributes of the architecture required to support the instruction (or
+;; alternative). This attribute is used to compute attribute "enabled", use type
+;; "any" to enable an alternative in all cases.
+
+(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
+
+(define_enum_attr "arch" "arches" (const_string "any"))
+
+;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
-;; registers.  When this is set to yes for an alternative, that alternative
-;; will be disabled when !TARGET_FLOAT.
-(define_attr "fp" "no,yes" (const_string "no"))
-
-;; Attribute that specifies whether or not the instruction touches half
-;; precision fp registers.  When this is set to yes for an alternative,
-;; that alternative will be disabled when !TARGET_FP_F16INST.
-(define_attr "fp16" "no,yes" (const_string "no"))
-
-;; Attribute that specifies whether or not the instruction touches simd
-;; registers.  When this is set to yes for an alternative, that alternative
-;; will be disabled when !TARGET_SIMD.
-(define_attr "simd" "no,yes" (const_string "no"))
-
-(define_attr "length" ""
-  (const_int 4))
+;; registers.
+;; Note that this attribute is not used anywhere in either the arm or aarch64
+;; backends except in the scheduling description for xgene1.  In that
+;; scheduling description this attribute is used to subclass the load_4 and
+;; load_8 types.
+(define_attr "fp" "no,yes"
+  (if_then_else
+    (eq_attr "arch" "fp")
+    (const_string "yes")
+    (const_string "no")))
+
+(define_attr "arch_enabled" "no,yes"
+  (if_then_else
+    (ior
+	(eq_attr "arch" "any")
+
+	(and (eq_attr "arch" "rcpc8_4")
+	     (match_test "AARCH64_ISA_RCPC8_4"))
+
+	(and (eq_attr "arch" "fp")
+	     (match_test "TARGET_FLOAT"))
+
+	(and (eq_attr "arch" "simd")
+	     (match_test "TARGET_SIMD"))
+
+	(and (eq_attr "arch" "fp16")
+	     (match_test "TARGET_FP_F16INST"))
+
+	(and (eq_attr "arch" "sve")
+	     (match_test "TARGET_SVE")))
+    (const_string "yes")
+    (const_string "no")))
 
 ;; Attribute that controls whether an alternative is enabled or not.
 ;; Currently it is only used to disable alternatives which touch fp or simd
-;; registers when -mgeneral-regs-only is specified.
-(define_attr "enabled" "no,yes"
-  (cond [(ior
-	    (ior
-		(and (eq_attr "fp" "yes")
-		     (eq (symbol_ref "TARGET_FLOAT") (const_int 0)))
-		(and (eq_attr "simd" "yes")
-		     (eq (symbol_ref "TARGET_SIMD") (const_int 0))))
-	    (and (eq_attr "fp16" "yes")
-		 (eq (symbol_ref "TARGET_FP_F16INST") (const_int 0))))
-	    (const_string "no")
-	] (const_string "yes")))
+;; registers when -mgeneral-regs-only is specified or to require a special
+;; architecture support.
+(define_attr "enabled" "no,yes" (attr "arch_enabled"))
 
 ;; Attribute that specifies whether we are dealing with a branch to a
 ;; label that is far away, i.e. further away than the maximum/minimum
@@ -219,10 +315,23 @@
 ;; 1 :=: yes
 (define_attr "far_branch" "" (const_int 0))
 
+;; Attribute that specifies whether the alternative uses MOVPRFX.
+(define_attr "movprfx" "no,yes" (const_string "no"))
+
+(define_attr "length" ""
+  (cond [(eq_attr "movprfx" "yes")
+           (const_int 8)
+        ] (const_int 4)))
+
 ;; Strictly for compatibility with AArch32 in pipeline models, since AArch64 has
 ;; no predicated insns.
 (define_attr "predicated" "yes,no" (const_string "no"))
 
+;; Set to true on an insn that requires the speculation tracking state to be
+;; in the tracking register before the insn issues.  Otherwise the compiler
+;; may chose to hold the tracking state encoded in SP.
+(define_attr "speculation_barrier" "true,false" (const_string "false"))
+
 ;; -------------------------------------------------------------------
 ;; Pipeline descriptions and scheduling
 ;; -------------------------------------------------------------------
@@ -626,7 +735,7 @@
 				(const_int 0))
 			   (label_ref (match_operand 1 "" ""))
 			   (pc)))]
-  ""
+  "!aarch64_track_speculation"
   {
     if (get_attr_length (insn) == 8)
       return aarch64_gen_far_branch (operands, 1, "Lcb", "<inv_cb>\\t%<w>0, ");
@@ -656,7 +765,7 @@
 	     (label_ref (match_operand 2 "" ""))
 	     (pc)))
    (clobber (reg:CC CC_REGNUM))]
-  ""
+  "!aarch64_track_speculation"
   {
     if (get_attr_length (insn) == 8)
       {
@@ -692,7 +801,7 @@
 			   (label_ref (match_operand 1 "" ""))
 			   (pc)))
    (clobber (reg:CC CC_REGNUM))]
-  ""
+  "!aarch64_track_speculation"
   {
     if (get_attr_length (insn) == 8)
       {
@@ -749,7 +858,7 @@
   ""
   "@
   blr\\t%0
-  bl\\t%a0"
+  bl\\t%c0"
   [(set_attr "type" "call, call")]
 )
 
@@ -775,7 +884,7 @@
   ""
   "@
   blr\\t%1
-  bl\\t%a1"
+  bl\\t%c1"
   [(set_attr "type" "call, call")]
 )
 
@@ -811,7 +920,7 @@
   "SIBLING_CALL_P (insn)"
   "@
    br\\t%0
-   b\\t%a0"
+   b\\t%c0"
   [(set_attr "type" "branch, branch")]
 )
 
@@ -824,7 +933,7 @@
   "SIBLING_CALL_P (insn)"
   "@
    br\\t%1
-   b\\t%a1"
+   b\\t%c1"
   [(set_attr "type" "branch, branch")]
 )
 
@@ -866,12 +975,18 @@
   "
     if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx)
       operands[1] = force_reg (<MODE>mode, operands[1]);
+
+    if (GET_CODE (operands[1]) == CONST_POLY_INT)
+      {
+	aarch64_expand_mov_immediate (operands[0], operands[1]);
+	DONE;
+      }
   "
 )
 
 (define_insn "*mov<mode>_aarch64"
-  [(set (match_operand:SHORT 0 "nonimmediate_operand" "=r,r,   *w,r,*w, m, m, r,*w,*w")
-        (match_operand:SHORT 1 "general_operand"      " r,M,D<hq>,m, m,rZ,*w,*w, r,*w"))]
+  [(set (match_operand:SHORT 0 "nonimmediate_operand" "=r,r,    w,r  ,r,w, m,m,r,w,w")
+	(match_operand:SHORT 1 "aarch64_mov_operand"  " r,M,D<hq>,Usv,m,m,rZ,w,w,r,w"))]
   "(register_operand (operands[0], <MODE>mode)
     || aarch64_reg_or_zero (operands[1], <MODE>mode))"
 {
@@ -885,26 +1000,29 @@
        return aarch64_output_scalar_simd_mov_immediate (operands[1],
 							<MODE>mode);
      case 3:
+       return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
+     case 4:
        return "ldr<size>\t%w0, %1";
-     case 4:
+     case 5:
        return "ldr\t%<size>0, %1";
-     case 5:
-       return "str<size>\t%w1, %0";
      case 6:
+       return "str<size>\t%w1, %0";
+     case 7:
        return "str\t%<size>1, %0";
-     case 7:
+     case 8:
        return "umov\t%w0, %1.<v>[0]";
-     case 8:
+     case 9:
        return "dup\t%0.<Vallxd>, %w1";
-     case 9:
+     case 10:
        return "dup\t%<Vetype>0, %1.<v>[0]";
      default:
        gcc_unreachable ();
      }
 }
-  [(set_attr "type" "mov_reg,mov_imm,neon_move,load_4,load_4,store_4,store_4,\
-                     neon_to_gp<q>,neon_from_gp<q>,neon_dup")
-   (set_attr "simd" "*,*,yes,*,*,*,*,yes,yes,yes")]
+  ;; The "mov_imm" type for CNT is just a placeholder.
+  [(set_attr "type" "mov_reg,mov_imm,neon_move,mov_imm,load_4,load_4,store_4,
+		     store_4,neon_to_gp<q>,neon_from_gp<q>,neon_dup")
+   (set_attr "arch" "*,*,simd,sve,*,*,*,*,simd,simd,simd")]
 )
 
 (define_expand "mov<mode>"
@@ -932,8 +1050,8 @@
 )
 
 (define_insn_and_split "*movsi_aarch64"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r,w, m, m,  r,  r, w,r,w, w")
-	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,m,m,rZ,*w,Usa,Ush,rZ,w,w,Ds"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,k,r,r,r,r, r,w, m, m,  r,  r, w,r,w, w")
+	(match_operand:SI 1 "aarch64_mov_operand"  " r,r,k,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Ds"))]
   "(register_operand (operands[0], SImode)
     || aarch64_reg_or_zero (operands[1], SImode))"
   "@
@@ -942,11 +1060,12 @@
    mov\\t%w0, %w1
    mov\\t%w0, %1
    #
+   * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
    ldr\\t%w0, %1
    ldr\\t%s0, %1
    str\\t%w1, %0
    str\\t%s1, %0
-   adr\\t%x0, %a1
+   adr\\t%x0, %c1
    adrp\\t%x0, %A1
    fmov\\t%s0, %w1
    fmov\\t%w0, %s1
@@ -959,15 +1078,15 @@
        aarch64_expand_mov_immediate (operands[0], operands[1]);
        DONE;
     }"
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,load_4,load_4,store_4,store_4,\
-		    adr,adr,f_mcr,f_mrc,fmov,neon_move")
-   (set_attr "fp" "*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
-   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
+  ;; The "mov_imm" type for CNT is just a placeholder.
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,load_4,
+		    load_4,store_4,store_4,adr,adr,f_mcr,f_mrc,fmov,neon_move")
+   (set_attr "arch" "*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
 )
 
 (define_insn_and_split "*movdi_aarch64"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r,w, m,m,  r,  r, w,r,w, w")
-	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,  r,  r, w,r,w, w")
+	(match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usa,Ush,rZ,w,w,Dd"))]
   "(register_operand (operands[0], DImode)
     || aarch64_reg_or_zero (operands[1], DImode))"
   "@
@@ -977,11 +1096,12 @@
    mov\\t%x0, %1
    mov\\t%w0, %1
    #
+   * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
    ldr\\t%x0, %1
    ldr\\t%d0, %1
    str\\t%x1, %0
    str\\t%d1, %0
-   adr\\t%x0, %a1
+   adr\\t%x0, %c1
    adrp\\t%x0, %A1
    fmov\\t%d0, %x1
    fmov\\t%x0, %d1
@@ -994,10 +1114,11 @@
        aarch64_expand_mov_immediate (operands[0], operands[1]);
        DONE;
     }"
-  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,load_8,\
-                     load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,neon_move")
-   (set_attr "fp" "*,*,*,*,*,*,*,yes,*,yes,*,*,yes,yes,yes,*")
-   (set_attr "simd" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes")]
+  ;; The "mov_imm" type for CNTD is just a placeholder.
+  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
+		     load_8,load_8,store_8,store_8,adr,adr,f_mcr,f_mrc,fmov,
+		     neon_move")
+   (set_attr "arch" "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,fp,fp,fp,simd")]
 )
 
 (define_insn "insv_imm<mode>"
@@ -1018,14 +1139,22 @@
   "
     if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx)
       operands[1] = force_reg (TImode, operands[1]);
+
+    if (GET_CODE (operands[1]) == CONST_POLY_INT)
+      {
+	emit_move_insn (gen_lowpart (DImode, operands[0]),
+			gen_lowpart (DImode, operands[1]));
+	emit_move_insn (gen_highpart (DImode, operands[0]), const0_rtx);
+	DONE;
+      }
   "
 )
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,m")
+	 "nonimmediate_operand"  "=   r,w, r,w,r,m,m,w,m")
 	(match_operand:TI 1
-	 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))]
+	 "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m,w"))]
   "(register_operand (operands[0], TImode)
     || aarch64_reg_or_zero (operands[1], TImode))"
   "@
@@ -1042,8 +1171,7 @@
 		             load_16,store_16,store_16,\
                              load_16,store_16")
    (set_attr "length" "8,8,8,4,4,4,4,4,4")
-   (set_attr "simd" "*,*,*,yes,*,*,*,*,*")
-   (set_attr "fp" "*,*,*,*,*,*,*,yes,yes")]
+   (set_attr "arch" "*,*,*,simd,*,*,*,fp,fp")]
 )
 
 ;; Split a TImode register-register or register-immediate move into
@@ -1066,7 +1194,7 @@
   {
     if (!TARGET_FLOAT)
       {
-	aarch64_err_no_fpadvsimd (<MODE>mode, "code");
+	aarch64_err_no_fpadvsimd (<MODE>mode);
 	FAIL;
       }
 
@@ -1078,26 +1206,26 @@
 )
 
 (define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  ,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
   "TARGET_FLOAT && (register_operand (operands[0], HFmode)
     || aarch64_reg_or_fp_zero (operands[1], HFmode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
+   dup\\t%w0.4h, %w1
    umov\\t%w0, %1.h[0]
    mov\\t%0.h[0], %1.h[0]
    fmov\\t%h0, %1
-   * return aarch64_output_scalar_simd_mov_immediate (operands[1], SImode);
+   * return aarch64_output_scalar_simd_mov_immediate (operands[1], HImode);
    ldr\\t%h0, %1
    str\\t%h1, %0
    ldrh\\t%w0, %1
    strh\\t%w1, %0
    mov\\t%w0, %w1"
-  [(set_attr "type" "neon_move,f_mcr,neon_to_gp,neon_move,fconsts, \
+  [(set_attr "type" "neon_move,f_mcr,neon_move,neon_to_gp, neon_move,fconsts, \
 		     neon_move,f_loads,f_stores,load_4,store_4,mov_reg")
-   (set_attr "simd" "yes,*,yes,yes,*,yes,*,*,*,*,*")
-   (set_attr "fp16"   "*,yes,*,*,yes,*,*,*,*,*,*")]
+   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")]
 )
 
 (define_insn "*movsf_aarch64"
@@ -1121,7 +1249,7 @@
   [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconsts,neon_move,\
 		     f_loads,f_stores,load_4,store_4,mov_reg,\
 		     fconsts")
-   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
+   (set_attr "arch" "simd,*,*,*,*,simd,*,*,*,*,*,*")]
 )
 
 (define_insn "*movdf_aarch64"
@@ -1145,7 +1273,7 @@
   [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
 		     f_loadd,f_stored,load_8,store_8,mov_reg,\
 		     fconstd")
-   (set_attr "simd" "yes,*,*,*,*,yes,*,*,*,*,*,*")]
+   (set_attr "arch" "simd,*,*,*,*,simd,*,*,*,*,*,*")]
 )
 
 (define_split
@@ -1190,7 +1318,7 @@
   [(set_attr "type" "logic_reg,multiple,f_mcr,f_mrc,neon_move_q,f_mcr,\
                      f_loadd,f_stored,load_16,store_16,store_16")
    (set_attr "length" "4,8,8,8,4,4,4,4,4,4,4")
-   (set_attr "simd" "yes,*,*,*,yes,*,*,*,*,*,*")]
+   (set_attr "arch" "simd,*,*,*,simd,*,*,*,*,*,*")]
 )
 
 (define_split
@@ -1224,139 +1352,72 @@
 
 ;; Operands 1 and 3 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
-(define_insn "load_pairsi"
-  [(set (match_operand:SI 0 "register_operand" "=r,*w")
-	(match_operand:SI 1 "aarch64_mem_pair_operand" "Ump,Ump"))
-   (set (match_operand:SI 2 "register_operand" "=r,*w")
-	(match_operand:SI 3 "memory_operand" "m,m"))]
-  "rtx_equal_p (XEXP (operands[3], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[1], 0),
-			       GET_MODE_SIZE (SImode)))"
+(define_insn "load_pair_sw_<SX:mode><SX2:mode>"
+  [(set (match_operand:SX 0 "register_operand" "=r,w")
+	(match_operand:SX 1 "aarch64_mem_pair_operand" "Ump,Ump"))
+   (set (match_operand:SX2 2 "register_operand" "=r,w")
+	(match_operand:SX2 3 "memory_operand" "m,m"))]
+   "rtx_equal_p (XEXP (operands[3], 0),
+		 plus_constant (Pmode,
+				XEXP (operands[1], 0),
+				GET_MODE_SIZE (<SX:MODE>mode)))"
   "@
    ldp\\t%w0, %w2, %1
    ldp\\t%s0, %s2, %1"
   [(set_attr "type" "load_8,neon_load1_2reg")
-   (set_attr "fp" "*,yes")]
-)
-
-(define_insn "load_pairdi"
-  [(set (match_operand:DI 0 "register_operand" "=r,*w")
-	(match_operand:DI 1 "aarch64_mem_pair_operand" "Ump,Ump"))
-   (set (match_operand:DI 2 "register_operand" "=r,*w")
-	(match_operand:DI 3 "memory_operand" "m,m"))]
-  "rtx_equal_p (XEXP (operands[3], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[1], 0),
-			       GET_MODE_SIZE (DImode)))"
+   (set_attr "arch" "*,fp")]
+)
+
+;; Storing different modes that can still be merged
+(define_insn "load_pair_dw_<DX:mode><DX2:mode>"
+  [(set (match_operand:DX 0 "register_operand" "=r,w")
+	(match_operand:DX 1 "aarch64_mem_pair_operand" "Ump,Ump"))
+   (set (match_operand:DX2 2 "register_operand" "=r,w")
+	(match_operand:DX2 3 "memory_operand" "m,m"))]
+   "rtx_equal_p (XEXP (operands[3], 0),
+		 plus_constant (Pmode,
+				XEXP (operands[1], 0),
+				GET_MODE_SIZE (<DX:MODE>mode)))"
   "@
    ldp\\t%x0, %x2, %1
    ldp\\t%d0, %d2, %1"
   [(set_attr "type" "load_16,neon_load1_2reg")
-   (set_attr "fp" "*,yes")]
-)
-
+   (set_attr "arch" "*,fp")]
+)
 
 ;; Operands 0 and 2 are tied together by the final condition; so we allow
 ;; fairly lax checking on the second memory operation.
-(define_insn "store_pairsi"
-  [(set (match_operand:SI 0 "aarch64_mem_pair_operand" "=Ump,Ump")
-	(match_operand:SI 1 "aarch64_reg_or_zero" "rZ,*w"))
-   (set (match_operand:SI 2 "memory_operand" "=m,m")
-	(match_operand:SI 3 "aarch64_reg_or_zero" "rZ,*w"))]
-  "rtx_equal_p (XEXP (operands[2], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[0], 0),
-			       GET_MODE_SIZE (SImode)))"
+(define_insn "store_pair_sw_<SX:mode><SX2:mode>"
+  [(set (match_operand:SX 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+	(match_operand:SX 1 "aarch64_reg_zero_or_fp_zero" "rYZ,w"))
+   (set (match_operand:SX2 2 "memory_operand" "=m,m")
+	(match_operand:SX2 3 "aarch64_reg_zero_or_fp_zero" "rYZ,w"))]
+   "rtx_equal_p (XEXP (operands[2], 0),
+		 plus_constant (Pmode,
+				XEXP (operands[0], 0),
+				GET_MODE_SIZE (<SX:MODE>mode)))"
   "@
    stp\\t%w1, %w3, %0
    stp\\t%s1, %s3, %0"
   [(set_attr "type" "store_8,neon_store1_2reg")
-   (set_attr "fp" "*,yes")]
-)
-
-(define_insn "store_pairdi"
-  [(set (match_operand:DI 0 "aarch64_mem_pair_operand" "=Ump,Ump")
-	(match_operand:DI 1 "aarch64_reg_or_zero" "rZ,*w"))
-   (set (match_operand:DI 2 "memory_operand" "=m,m")
-	(match_operand:DI 3 "aarch64_reg_or_zero" "rZ,*w"))]
-  "rtx_equal_p (XEXP (operands[2], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[0], 0),
-			       GET_MODE_SIZE (DImode)))"
+   (set_attr "arch" "*,fp")]
+)
+
+;; Storing different modes that can still be merged
+(define_insn "store_pair_dw_<DX:mode><DX2:mode>"
+  [(set (match_operand:DX 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+	(match_operand:DX 1 "aarch64_reg_zero_or_fp_zero" "rYZ,w"))
+   (set (match_operand:DX2 2 "memory_operand" "=m,m")
+	(match_operand:DX2 3 "aarch64_reg_zero_or_fp_zero" "rYZ,w"))]
+   "rtx_equal_p (XEXP (operands[2], 0),
+		 plus_constant (Pmode,
+				XEXP (operands[0], 0),
+				GET_MODE_SIZE (<DX:MODE>mode)))"
   "@
    stp\\t%x1, %x3, %0
    stp\\t%d1, %d3, %0"
   [(set_attr "type" "store_16,neon_store1_2reg")
-   (set_attr "fp" "*,yes")]
-)
-
-;; Operands 1 and 3 are tied together by the final condition; so we allow
-;; fairly lax checking on the second memory operation.
-(define_insn "load_pairsf"
-  [(set (match_operand:SF 0 "register_operand" "=w,*r")
-	(match_operand:SF 1 "aarch64_mem_pair_operand" "Ump,Ump"))
-   (set (match_operand:SF 2 "register_operand" "=w,*r")
-	(match_operand:SF 3 "memory_operand" "m,m"))]
-  "rtx_equal_p (XEXP (operands[3], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[1], 0),
-			       GET_MODE_SIZE (SFmode)))"
-  "@
-   ldp\\t%s0, %s2, %1
-   ldp\\t%w0, %w2, %1"
-  [(set_attr "type" "neon_load1_2reg,load_8")
-   (set_attr "fp" "yes,*")]
-)
-
-(define_insn "load_pairdf"
-  [(set (match_operand:DF 0 "register_operand" "=w,*r")
-	(match_operand:DF 1 "aarch64_mem_pair_operand" "Ump,Ump"))
-   (set (match_operand:DF 2 "register_operand" "=w,*r")
-	(match_operand:DF 3 "memory_operand" "m,m"))]
-  "rtx_equal_p (XEXP (operands[3], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[1], 0),
-			       GET_MODE_SIZE (DFmode)))"
-  "@
-   ldp\\t%d0, %d2, %1
-   ldp\\t%x0, %x2, %1"
-  [(set_attr "type" "neon_load1_2reg,load_16")
-   (set_attr "fp" "yes,*")]
-)
-
-;; Operands 0 and 2 are tied together by the final condition; so we allow
-;; fairly lax checking on the second memory operation.
-(define_insn "store_pairsf"
-  [(set (match_operand:SF 0 "aarch64_mem_pair_operand" "=Ump,Ump")
-	(match_operand:SF 1 "aarch64_reg_or_fp_zero" "w,*rY"))
-   (set (match_operand:SF 2 "memory_operand" "=m,m")
-	(match_operand:SF 3 "aarch64_reg_or_fp_zero" "w,*rY"))]
-  "rtx_equal_p (XEXP (operands[2], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[0], 0),
-			       GET_MODE_SIZE (SFmode)))"
-  "@
-   stp\\t%s1, %s3, %0
-   stp\\t%w1, %w3, %0"
-  [(set_attr "type" "neon_store1_2reg,store_8")
-   (set_attr "fp" "yes,*")]
-)
-
-(define_insn "store_pairdf"
-  [(set (match_operand:DF 0 "aarch64_mem_pair_operand" "=Ump,Ump")
-	(match_operand:DF 1 "aarch64_reg_or_fp_zero" "w,*rY"))
-   (set (match_operand:DF 2 "memory_operand" "=m,m")
-	(match_operand:DF 3 "aarch64_reg_or_fp_zero" "w,*rY"))]
-  "rtx_equal_p (XEXP (operands[2], 0),
-		plus_constant (Pmode,
-			       XEXP (operands[0], 0),
-			       GET_MODE_SIZE (DFmode)))"
-  "@
-   stp\\t%d1, %d3, %0
-   stp\\t%x1, %x3, %0"
-  [(set_attr "type" "neon_store1_2reg,store_16")
-   (set_attr "fp" "yes,*")]
+   (set_attr "arch" "*,fp")]
 )
 
 ;; Load pair with post-index writeback.  This is primarily used in function
@@ -1459,26 +1520,34 @@
 )
 
 (define_insn "*zero_extendsidi2_aarch64"
-  [(set (match_operand:DI 0 "register_operand" "=r,r")
-        (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
+  [(set (match_operand:DI 0 "register_operand" "=r,r,w,w,r,w")
+        (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "r,m,r,m,w,w")))]
   ""
   "@
    uxtw\t%0, %w1
-   ldr\t%w0, %1"
-  [(set_attr "type" "extend,load_4")]
+   ldr\t%w0, %1
+   fmov\t%s0, %w1
+   ldr\t%s0, %1
+   fmov\t%w0, %s1
+   fmov\t%s0, %s1"
+  [(set_attr "type" "extend,load_4,f_mcr,f_loads,f_mrc,fmov")
+   (set_attr "arch" "*,*,fp,fp,fp,fp")]
 )
 
 (define_insn "*load_pair_zero_extendsidi2_aarch64"
-  [(set (match_operand:DI 0 "register_operand" "=r")
-	(zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump")))
-   (set (match_operand:DI 2 "register_operand" "=r")
-	(zero_extend:DI (match_operand:SI 3 "memory_operand" "m")))]
+  [(set (match_operand:DI 0 "register_operand" "=r,w")
+	(zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump,Ump")))
+   (set (match_operand:DI 2 "register_operand" "=r,w")
+	(zero_extend:DI (match_operand:SI 3 "memory_operand" "m,m")))]
   "rtx_equal_p (XEXP (operands[3], 0),
 		plus_constant (Pmode,
 			       XEXP (operands[1], 0),
 			       GET_MODE_SIZE (SImode)))"
-  "ldp\\t%w0, %w2, %1"
-  [(set_attr "type" "load_8")]
+  "@
+   ldp\t%w0, %w2, %1
+   ldp\t%s0, %s2, %1"
+  [(set_attr "type" "load_8,neon_load1_2reg")
+   (set_attr "arch" "*,fp")]
 )
 
 (define_expand "<ANY_EXTEND:optab><SHORT:mode><GPI:mode>2"
@@ -1498,14 +1567,15 @@
 )
 
 (define_insn "*zero_extend<SHORT:mode><GPI:mode>2_aarch64"
-  [(set (match_operand:GPI 0 "register_operand" "=r,r,*w")
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,w")
         (zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand" "r,m,m")))]
   ""
   "@
    and\t%<GPI:w>0, %<GPI:w>1, <SHORT:short_mask>
    ldr<SHORT:size>\t%w0, %1
    ldr\t%<SHORT:size>0, %1"
-  [(set_attr "type" "logic_imm,load_4,load_4")]
+  [(set_attr "type" "logic_imm,load_4,f_loads")
+   (set_attr "arch" "*,*,fp")]
 )
 
 (define_expand "<optab>qihi2"
@@ -1542,7 +1612,7 @@
   [(set
     (match_operand:GPI 0 "register_operand" "")
     (plus:GPI (match_operand:GPI 1 "register_operand" "")
-	      (match_operand:GPI 2 "aarch64_pluslong_operand" "")))]
+	      (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "")))]
   ""
 {
   /* If operands[1] is a subreg extract the inner RTX.  */
@@ -1555,23 +1625,34 @@
       && (!REG_P (op1)
 	 || !REGNO_PTR_FRAME_P (REGNO (op1))))
     operands[2] = force_reg (<MODE>mode, operands[2]);
+  /* Expand polynomial additions now if the destination is the stack
+     pointer, since we don't want to use that as a temporary.  */
+  else if (operands[0] == stack_pointer_rtx
+	   && aarch64_split_add_offset_immediate (operands[2], <MODE>mode))
+    {
+      aarch64_split_add_offset (<MODE>mode, operands[0], operands[1],
+				operands[2], NULL_RTX, NULL_RTX);
+      DONE;
+    }
 })
 
 (define_insn "*add<mode>3_aarch64"
   [(set
-    (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r")
+    (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,rk")
     (plus:GPI
-     (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk")
-     (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Upl")))]
+     (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,rk")
+     (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uav")))]
   ""
   "@
   add\\t%<w>0, %<w>1, %2
   add\\t%<w>0, %<w>1, %<w>2
   add\\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>
   sub\\t%<w>0, %<w>1, #%n2
-  #"
-  [(set_attr "type" "alu_imm,alu_sreg,neon_add,alu_imm,multiple")
-   (set_attr "simd" "*,*,yes,*,*")]
+  #
+  * return aarch64_output_sve_addvl_addpl (operands[0], operands[1], operands[2]);"
+  ;; The "alu_imm" type for ADDVL/ADDPL is just a placeholder.
+  [(set_attr "type" "alu_imm,alu_sreg,neon_add,alu_imm,multiple,alu_imm")
+   (set_attr "arch" "*,*,simd,*,*,*")]
 )
 
 ;; zero_extend version of above
@@ -1580,7 +1661,7 @@
     (match_operand:DI 0 "register_operand" "=rk,rk,rk,r")
     (zero_extend:DI
      (plus:SI (match_operand:SI 1 "register_operand" "%rk,rk,rk,rk")
-              (match_operand:SI 2 "aarch64_pluslong_operand" "I,r,J,Upl"))))]
+	      (match_operand:SI 2 "aarch64_pluslong_operand" "I,r,J,Uaa"))))]
   ""
   "@
   add\\t%w0, %w1, %2
@@ -1633,6 +1714,48 @@
   }
 )
 
+;; Match addition of polynomial offsets that require one temporary, for which
+;; we can use the early-clobbered destination register.  This is a separate
+;; pattern so that the early clobber doesn't affect register allocation
+;; for other forms of addition.  However, we still need to provide an
+;; all-register alternative, in case the offset goes out of range after
+;; elimination.  For completeness we might as well provide all GPR-based
+;; alternatives from the main pattern.
+;;
+;; We don't have a pattern for additions requiring two temporaries since at
+;; present LRA doesn't allow new scratches to be added during elimination.
+;; Such offsets should be rare anyway.
+;;
+;; ??? But if we added LRA support for new scratches, much of the ugliness
+;; here would go away.  We could just handle all polynomial constants in
+;; this pattern.
+(define_insn_and_split "*add<mode>3_poly_1"
+  [(set
+    (match_operand:GPI 0 "register_operand" "=r,r,r,r,r,&r")
+    (plus:GPI
+     (match_operand:GPI 1 "register_operand" "%rk,rk,rk,rk,rk,rk")
+     (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "I,r,J,Uaa,Uav,Uat")))]
+  "TARGET_SVE && operands[0] != stack_pointer_rtx"
+  "@
+  add\\t%<w>0, %<w>1, %2
+  add\\t%<w>0, %<w>1, %<w>2
+  sub\\t%<w>0, %<w>1, #%n2
+  #
+  * return aarch64_output_sve_addvl_addpl (operands[0], operands[1], operands[2]);
+  #"
+  "&& epilogue_completed
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && aarch64_split_add_offset_immediate (operands[2], <MODE>mode)"
+  [(const_int 0)]
+  {
+    aarch64_split_add_offset (<MODE>mode, operands[0], operands[1],
+			      operands[2], operands[0], NULL_RTX);
+    DONE;
+  }
+  ;; The "alu_imm" type for ADDVL/ADDPL is just a placeholder.
+  [(set_attr "type" "alu_imm,alu_sreg,alu_imm,multiple,alu_imm,multiple")]
+)
+
 (define_split
   [(set (match_operand:DI 0 "register_operand")
 	(zero_extend:DI
@@ -1651,25 +1774,133 @@
   }
 )
 
+(define_expand "addv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "uaddv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+
+  DONE;
+})
+
 (define_expand "addti3"
   [(set (match_operand:TI 0 "register_operand" "")
 	(plus:TI (match_operand:TI 1 "register_operand" "")
-		 (match_operand:TI 2 "register_operand" "")))]
+		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
+  ""
+{
+  rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high;
+
+  aarch64_addti_scratch_regs (operands[1], operands[2],
+			      &low_dest, &op1_low, &op2_low,
+			      &high_dest, &op1_high, &op2_high);
+
+  if (op2_low == const0_rtx)
+    {
+      low_dest = op1_low;
+      if (!aarch64_pluslong_operand (op2_high, DImode))
+	op2_high = force_reg (DImode, op2_high);
+      emit_insn (gen_adddi3 (high_dest, op1_high, op2_high));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (low_dest, op1_low,
+				      force_reg (DImode, op2_low)));
+      emit_insn (gen_adddi3_carryin (high_dest, op1_high,
+				     force_reg (DImode, op2_high)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), low_dest);
+  emit_move_insn (gen_highpart (DImode, operands[0]), high_dest);
+
+  DONE;
+})
+
+(define_expand "addvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (label_ref (match_operand 3 "" ""))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
-
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
-
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
+  rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high;
+
+  aarch64_addti_scratch_regs (operands[1], operands[2],
+			      &low_dest, &op1_low, &op2_low,
+			      &high_dest, &op1_high, &op2_high);
+
+  if (op2_low == const0_rtx)
+    {
+      low_dest = op1_low;
+      emit_insn (gen_adddi3_compareV (high_dest, op1_high,
+				      force_reg (DImode, op2_high)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (low_dest, op1_low,
+				      force_reg (DImode, op2_low)));
+      emit_insn (gen_adddi3_carryinV (high_dest, op1_high,
+				      force_reg (DImode, op2_high)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), low_dest);
+  emit_move_insn (gen_highpart (DImode, operands[0]), high_dest);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
   DONE;
 })
 
+(define_expand "uaddvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high;
+
+  aarch64_addti_scratch_regs (operands[1], operands[2],
+			      &low_dest, &op1_low, &op2_low,
+			      &high_dest, &op1_high, &op2_high);
+
+  if (op2_low == const0_rtx)
+    {
+      low_dest = op1_low;
+      emit_insn (gen_adddi3_compareC (high_dest, op1_high,
+				      force_reg (DImode, op2_high)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (low_dest, op1_low,
+				      force_reg (DImode, op2_low)));
+      emit_insn (gen_adddi3_carryinC (high_dest, op1_high,
+				      force_reg (DImode, op2_high)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), low_dest);
+  emit_move_insn (gen_highpart (DImode, operands[0]), high_dest);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+  DONE;
+ })
+
 (define_insn "add<mode>3_compare0"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
@@ -1755,7 +1986,7 @@
 
 (define_insn "add<mode>3_compareC"
   [(set (reg:CC_C CC_REGNUM)
-	(ne:CC_C
+	(compare:CC_C
 	  (plus:<DWI>
 	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
 	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
@@ -1768,10 +1999,71 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+(define_insn "*add<mode>3_compareV_cconly_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
+	    (match_operand:<DWI> 1 "const_scalar_int_operand" ""))
+	  (sign_extend:<DWI>
+	   (plus:GPI
+	    (match_dup 0)
+	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))))]
+  "INTVAL (operands[1]) == INTVAL (operands[2])"
+  "@
+  cmn\\t%<w>0, %<w>1
+  cmp\\t%<w>0, #%n1"
+  [(set_attr "type" "alus_imm")]
+)
+
+(define_insn "*add<mode>3_compareV_cconly"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "cmn\\t%<w>0, %<w>1"
+  [(set_attr "type" "alus_sreg")]
+)
+
+(define_insn "*add<mode>3_compareV_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r,r"))
+	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI>
+	    (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+   ""
+   "@
+   adds\\t%<w>0, %<w>1, %<w>2
+   subs\\t%<w>0, %<w>1, #%n2"
+  [(set_attr "type" "alus_imm,alus_imm")]
+)
+
+(define_insn "add<mode>3_compareV"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+  ""
+  "adds\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "alus_sreg")]
+)
+
 (define_insn "*adds_shift_imm_<mode>"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
-	 (plus:GPI (ASHIFT:GPI 
+	 (plus:GPI (ASHIFT:GPI
 		    (match_operand:GPI 1 "register_operand" "r")
 		    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
 		   (match_operand:GPI 3 "register_operand" "r"))
@@ -2138,6 +2430,138 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "add<mode>3_carryinC"
+  [(parallel
+     [(set (match_dup 3)
+	   (compare:CC_C
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 4)
+		 (zero_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "")))
+	       (zero_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "")))
+	   (zero_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 5) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 5) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
+  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinC_zero"
+  [(set (reg:CC_C CC_REGNUM)
+	(compare:CC_C
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinC"
+  [(set (reg:CC_C CC_REGNUM)
+	(compare:CC_C
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_expand "add<mode>3_carryinV"
+  [(parallel
+     [(set (reg:CC_V CC_REGNUM)
+	   (compare:CC_V
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 3)
+		 (sign_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "")))
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "")))
+	   (sign_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 4) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 4) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
+  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinV_zero"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinV"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*add_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(plus:GPI (and:GPI
@@ -2231,25 +2655,90 @@
    sub\\t%x0, %x1, %x2
    sub\\t%d0, %d1, %d2"
   [(set_attr "type" "alu_sreg, neon_sub")
-   (set_attr "simd" "*,yes")]
-)
+   (set_attr "arch" "*,simd")]
+)
+
+(define_expand "subv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "usubv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
+
+  DONE;
+})
 
 (define_expand "subti3"
   [(set (match_operand:TI 0 "register_operand" "")
-	(minus:TI (match_operand:TI 1 "register_operand" "")
+	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
 		  (match_operand:TI 2 "register_operand" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
-
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
-
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
+  rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high;
+
+  aarch64_subvti_scratch_regs (operands[1], operands[2],
+			       &low_dest, &op1_low, &op2_low,
+			       &high_dest, &op1_high, &op2_high);
+
+  emit_insn (gen_subdi3_compare1 (low_dest, op1_low, op2_low));
+  emit_insn (gen_subdi3_carryin (high_dest, op1_high, op2_high));
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), low_dest);
+  emit_move_insn (gen_highpart (DImode, operands[0]), high_dest);
+  DONE;
+})
+
+(define_expand "subvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high;
+
+  aarch64_subvti_scratch_regs (operands[1], operands[2],
+			       &low_dest, &op1_low, &op2_low,
+			       &high_dest, &op1_high, &op2_high);
+  aarch64_expand_subvti (operands[0], low_dest, op1_low, op2_low,
+			 high_dest, op1_high, op2_high);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+  DONE;
+})
+
+(define_expand "usubvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (label_ref (match_operand 3 "" ""))]
+  ""
+{
+  rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high;
+
+  aarch64_subvti_scratch_regs (operands[1], operands[2],
+				    &low_dest, &op1_low, &op2_low,
+			       &high_dest, &op1_high, &op2_high);
+  aarch64_expand_subvti (operands[0], low_dest, op1_low, op2_low,
+			 high_dest, op1_high, op2_high);
+
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
   DONE;
 })
 
@@ -2278,6 +2767,22 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+(define_insn "*sub<mode>3_compare1_imm"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
+	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
+  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
+  "@
+  subs\\t%<w>0, %<w>1, #%n3
+  adds\\t%<w>0, %<w>1, %3"
+  [(set_attr "type" "alus_imm")]
+)
+
 (define_insn "sub<mode>3_compare1"
   [(set (reg:CC CC_REGNUM)
 	(compare:CC
@@ -2321,6 +2826,26 @@
   }
 )
 
+;; Same as the above peephole but with the compare and minus in
+;; swapped order.  The restriction on overlap between operand 0
+;; and operands 1 and 2 doesn't apply here.
+(define_peephole2
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (match_operand:GPI 1 "aarch64_reg_or_zero")
+	  (match_operand:GPI 2 "aarch64_reg_or_zero")))
+   (set (match_operand:GPI 0 "register_operand")
+	(minus:GPI (match_dup 1)
+		   (match_dup 2)))]
+  ""
+  [(const_int 0)]
+  {
+    emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
+					 operands[2]));
+    DONE;
+  }
+)
+
 (define_peephole2
   [(set (match_operand:GPI 0 "register_operand")
 	(plus:GPI (match_operand:GPI 1 "register_operand")
@@ -2339,6 +2864,26 @@
   }
 )
 
+;; Same as the above peephole but with the compare and minus in
+;; swapped order.  The restriction on overlap between operand 0
+;; and operands 1 doesn't apply here.
+(define_peephole2
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (match_operand:GPI 1 "register_operand")
+	  (match_operand:GPI 3 "const_int_operand")))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 1)
+		  (match_operand:GPI 2 "aarch64_sub_immediate")))]
+  "INTVAL (operands[3]) == -INTVAL (operands[2])"
+  [(const_int 0)]
+  {
+    emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+					 operands[2], operands[3]));
+    DONE;
+  }
+)
+
 (define_insn "*sub_<shift>_<mode>"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 	(minus:GPI (match_operand:GPI 3 "register_operand" "r")
@@ -2554,6 +3099,85 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "sub<mode>3_carryinCV"
+  [(parallel
+     [(set (reg:CC CC_REGNUM)
+	   (compare:CC
+	     (sign_extend:<DWI>
+	       (match_operand:GPI 1 "aarch64_reg_or_zero" ""))
+	     (plus:<DWI>
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" ""))
+	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
+      (set (match_operand:GPI 0 "register_operand" "")
+	   (minus:GPI
+	     (minus:GPI (match_dup 1) (match_dup 2))
+	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
+   ""
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r"))
+	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (neg:GPI (match_dup 1))
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, %<w>1"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 2 "register_operand" "r"))
+	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (minus:GPI (match_dup 1) (match_dup 2))
+	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*sub_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(minus:GPI (match_operand:GPI 4 "register_operand" "rk")
@@ -2638,7 +3262,7 @@
    neg\\t%<w>0, %<w>1
    neg\\t%<rtn>0<vas>, %<rtn>1<vas>"
   [(set_attr "type" "alu_sreg, neon_neg<q>")
-   (set_attr "simd" "*,yes")]
+   (set_attr "arch" "*,simd")]
 )
 
 ;; zero_extend version of above
@@ -2963,7 +3587,7 @@
 
 (define_insn "cmp<mode>"
   [(set (reg:CC CC_REGNUM)
-	(compare:CC (match_operand:GPI 0 "register_operand" "r,r,r")
+	(compare:CC (match_operand:GPI 0 "register_operand" "rk,rk,rk")
 		    (match_operand:GPI 1 "aarch64_plus_operand" "r,I,J")))]
   ""
   "@
@@ -3092,7 +3716,8 @@
 (define_insn_and_split "*compare_cstore<mode>_insn"
   [(set (match_operand:GPI 0 "register_operand" "=r")
 	 (EQL:GPI (match_operand:GPI 1 "register_operand" "r")
-		  (match_operand:GPI 2 "aarch64_imm24" "n")))]
+		  (match_operand:GPI 2 "aarch64_imm24" "n")))
+   (clobber (reg:CC CC_REGNUM))]
   "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode)
    && !aarch64_plus_operand (operands[2], <MODE>mode)
    && !reload_completed"
@@ -3328,7 +3953,7 @@
          CRC))]
   "TARGET_CRC32"
   {
-    if (GET_MODE_BITSIZE (GET_MODE (operands[2])) >= 64)
+    if (GET_MODE_BITSIZE (<crc_mode>mode) >= 64)
       return "<crc_variant>\\t%w0, %w1, %x2";
     else
       return "<crc_variant>\\t%w0, %w1, %w2";
@@ -3391,6 +4016,63 @@
   [(set_attr "type" "csel")]
 )
 
+;; If X can be loaded by a single CNT[BHWD] instruction,
+;;
+;;    A = UMAX (B, X)
+;;
+;; is equivalent to:
+;;
+;;    TMP = UQDEC[BHWD] (B, X)
+;;    A = TMP + X
+;;
+;; Defining the pattern this way means that:
+;;
+;;    A = UMAX (B, X) - X
+;;
+;; becomes:
+;;
+;;    TMP1 = UQDEC[BHWD] (B, X)
+;;    TMP2 = TMP1 + X
+;;    A = TMP2 - X
+;;
+;; which combine can optimize to:
+;;
+;;    A = UQDEC[BHWD] (B, X)
+;;
+;; We don't use match_operand predicates because the order of the operands
+;; can vary: the CNT[BHWD] constant will come first if the other operand is
+;; a simpler constant (such as a CONST_INT), otherwise it will come second.
+(define_expand "umax<mode>3"
+  [(set (match_operand:GPI 0 "register_operand")
+	(umax:GPI (match_operand:GPI 1 "")
+		  (match_operand:GPI 2 "")))]
+  "TARGET_SVE"
+  {
+    if (aarch64_sve_cnt_immediate (operands[1], <MODE>mode))
+      std::swap (operands[1], operands[2]);
+    else if (!aarch64_sve_cnt_immediate (operands[2], <MODE>mode))
+      FAIL;
+    rtx temp = gen_reg_rtx (<MODE>mode);
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+    emit_insn (gen_aarch64_uqdec<mode> (temp, operands[1], operands[2]));
+    emit_insn (gen_add<mode>3 (operands[0], temp, operands[2]));
+    DONE;
+  }
+)
+
+;; Saturating unsigned subtraction of a CNT[BHWD] immediate.
+(define_insn "aarch64_uqdec<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	 (umax:GPI (match_operand:GPI 1 "register_operand" "0")
+		   (match_operand:GPI 2 "aarch64_sve_cnt_immediate" "Usv"))
+	 (match_dup 2)))]
+  "TARGET_SVE"
+  {
+    return aarch64_output_sve_cnt_immediate ("uqdec", "%<w>0", operands[2]);
+  }
+)
+
 ;; -------------------------------------------------------------------
 ;; Logical operations
 ;; -------------------------------------------------------------------
@@ -3425,7 +4107,7 @@
   <logical>\\t%<w>0, %<w>1, %2
   <logical>\\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>"
   [(set_attr "type" "logic_reg,logic_imm,neon_logic")
-   (set_attr "simd" "*,*,yes")]
+   (set_attr "arch" "*,*,simd")]
 )
 
 ;; zero_extend version of above
@@ -3559,7 +4241,7 @@
   mvn\\t%<w>0, %<w>1
   mvn\\t%0.8b, %1.8b"
   [(set_attr "type" "logic_reg,neon_logic")
-   (set_attr "simd" "*,yes")]
+   (set_attr "arch" "*,simd")]
 )
 
 (define_insn "*one_cmpl_<optab><mode>2"
@@ -3582,7 +4264,7 @@
   <NLOGICAL:nlogical>\\t%<w>0, %<w>2, %<w>1
   <NLOGICAL:nlogical>\\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>"
   [(set_attr "type" "logic_reg,neon_logic")
-   (set_attr "simd" "*,yes")]
+   (set_attr "arch" "*,simd")]
 )
 
 (define_insn "*<NLOGICAL:optab>_one_cmplsidi3_ze"
@@ -3622,7 +4304,7 @@
    (set (match_dup 0) (not:GPI (match_dup 0)))]
   ""
   [(set_attr "type" "logic_reg,multiple")
-   (set_attr "simd" "*,yes")]
+   (set_attr "arch" "*,simd")]
 )
 
 (define_insn "*and_one_cmpl<mode>3_compare0"
@@ -3958,7 +4640,7 @@
 (define_expand "<optab><mode>3"
   [(set (match_operand:GPI 0 "register_operand")
 	(ASHIFT:GPI (match_operand:GPI 1 "register_operand")
-		    (match_operand:QI 2 "nonmemory_operand")))]
+		    (match_operand:QI 2 "aarch64_reg_or_imm")))]
   ""
   {
     if (CONST_INT_P (operands[2]))
@@ -3994,7 +4676,7 @@
 (define_expand "rotr<mode>3"
   [(set (match_operand:GPI 0 "register_operand")
 	(rotatert:GPI (match_operand:GPI 1 "register_operand")
-		      (match_operand:QI 2 "nonmemory_operand")))]
+		      (match_operand:QI 2 "aarch64_reg_or_imm")))]
   ""
   {
     if (CONST_INT_P (operands[2]))
@@ -4014,7 +4696,7 @@
 (define_expand "rotl<mode>3"
   [(set (match_operand:GPI 0 "register_operand")
 	(rotatert:GPI (match_operand:GPI 1 "register_operand")
-		      (match_operand:QI 2 "nonmemory_operand")))]
+		      (match_operand:QI 2 "aarch64_reg_or_imm")))]
   ""
   {
     /* (SZ - cnt) % SZ == -cnt % SZ */
@@ -4054,7 +4736,7 @@
   [(set_attr "type" "shift_reg")]
 )
 
-(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
+(define_insn_and_split "*aarch64_<optab>_reg_<mode>3_neg_mask2"
   [(set (match_operand:GPI 0 "register_operand" "=&r")
 	(SHIFT:GPI
 	  (match_operand:GPI 1 "register_operand" "r")
@@ -4067,7 +4749,7 @@
   [(const_int 0)]
   {
     rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
-	       : operands[0]);
+	       : lowpart_subreg (SImode, operands[0], <MODE>mode));
     emit_insn (gen_negsi2 (tmp, operands[2]));
 
     rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
@@ -4078,7 +4760,7 @@
   }
 )
 
-(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
+(define_insn_and_split "*aarch64_ashl_reg_<mode>3_minus_mask"
   [(set (match_operand:GPI 0 "register_operand" "=&r")
 	(ashift:GPI
 	  (match_operand:GPI 1 "register_operand" "r")
@@ -4112,8 +4794,8 @@
 	  (match_operand:DI 1 "register_operand" "r")
 	  (match_operator 4 "subreg_lowpart_operator"
 	   [(and:SI (match_operand:SI 2 "register_operand" "r")
-		     (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
-  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
+		    (match_operand 3 "const_int_operand" "n"))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode) - 1)) == 0)"
 {
   rtx xop[3];
   xop[0] = operands[0];
@@ -4125,7 +4807,7 @@
   [(set_attr "type" "shift_reg")]
 )
 
-(define_insn_and_split "*aarch64_reg_<optab>_minus<mode>3"
+(define_insn_and_split "*aarch64_<optab>_reg_minus<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=&r")
 	(ASHIFT:GPI
 	  (match_operand:GPI 1 "register_operand" "r")
@@ -4166,8 +4848,8 @@
    lsl\t%<w>0, %<w>1, %<w>2
    shl\t%<rtn>0<vas>, %<rtn>1<vas>, %2
    ushl\t%<rtn>0<vas>, %<rtn>1<vas>, %<rtn>2<vas>"
-  [(set_attr "simd" "no,no,yes,yes")
-   (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")]
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>, neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd")]
 )
 
 ;; Logical right shift using SISD or Integer instruction
@@ -4175,7 +4857,8 @@
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
 	(lshiftrt:GPI
 	 (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
-	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>" "Us<cmode>,r,Us<cmode>,w,0")))]
+	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_<mode>"
+			      "Us<cmode>,r,Us<cmode_simd>,w,0")))]
   ""
   "@
    lsr\t%<w>0, %<w>1, %2
@@ -4183,8 +4866,8 @@
    ushr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
    #
    #"
-  [(set_attr "simd" "no,no,yes,yes,yes")
-   (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd,simd")]
 )
 
 (define_split
@@ -4220,9 +4903,10 @@
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
-        (ashiftrt:GPI
-          (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
-          (match_operand:QI 2 "aarch64_reg_or_shift_imm_di" "Us<cmode>,r,Us<cmode>,w,0")))]
+	(ashiftrt:GPI
+	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
+			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
   ""
   "@
    asr\t%<w>0, %<w>1, %2
@@ -4230,8 +4914,8 @@
    sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
    #
    #"
-  [(set_attr "simd" "no,no,yes,yes,yes")
-   (set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")]
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd,simd")]
 )
 
 (define_split
@@ -4271,8 +4955,7 @@
                    UNSPEC_SISD_USHL))]
   "TARGET_SIMD"
   "ushl\t%d0, %d1, %d2"
-  [(set_attr "simd" "yes")
-   (set_attr "type" "neon_shift_reg")]
+  [(set_attr "type" "neon_shift_reg")]
 )
 
 (define_insn "*aarch64_ushl_2s"
@@ -4282,8 +4965,7 @@
                    UNSPEC_USHL_2S))]
   "TARGET_SIMD"
   "ushl\t%0.2s, %1.2s, %2.2s"
-  [(set_attr "simd" "yes")
-   (set_attr "type" "neon_shift_reg")]
+  [(set_attr "type" "neon_shift_reg")]
 )
 
 (define_insn "*aarch64_sisd_sshl"
@@ -4293,8 +4975,7 @@
                    UNSPEC_SISD_SSHL))]
   "TARGET_SIMD"
   "sshl\t%d0, %d1, %d2"
-  [(set_attr "simd" "yes")
-   (set_attr "type" "neon_shift_reg")]
+  [(set_attr "type" "neon_shift_reg")]
 )
 
 (define_insn "*aarch64_sshl_2s"
@@ -4304,8 +4985,7 @@
                    UNSPEC_SSHL_2S))]
   "TARGET_SIMD"
   "sshl\t%0.2s, %1.2s, %2.2s"
-  [(set_attr "simd" "yes")
-   (set_attr "type" "neon_shift_reg")]
+  [(set_attr "type" "neon_shift_reg")]
 )
 
 (define_insn "*aarch64_sisd_neg_qi"
@@ -4314,8 +4994,7 @@
                    UNSPEC_SISD_NEG))]
   "TARGET_SIMD"
   "neg\t%d0, %d1"
-  [(set_attr "simd" "yes")
-   (set_attr "type" "neon_neg")]
+  [(set_attr "type" "neon_neg")]
 )
 
 ;; Rotate right
@@ -4621,6 +5300,20 @@
   [(set_attr "type" "bfx")]
 )
 
+;; Match sbfiz pattern in a shift left + shift right operation.
+
+(define_insn "*ashift<mode>_extv_bfiz"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(ashift:GPI (sign_extract:GPI (match_operand:GPI 1 "register_operand" "r")
+				      (match_operand 2 "aarch64_simd_shift_imm_offset_<mode>" "n")
+				      (const_int 0))
+		     (match_operand 3 "aarch64_simd_shift_imm_<mode>" "n")))]
+  "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]),
+	     1, GET_MODE_BITSIZE (<MODE>mode) - 1)"
+  "sbfiz\\t%<w>0, %<w>1, %3, %2"
+  [(set_attr "type" "bfx")]
+)
+
 ;; When the bit position and width of the equivalent extraction add up to 32
 ;; we can use a W-reg LSL instruction taking advantage of the implicit
 ;; zero-extension of the X-reg.
@@ -4655,6 +5348,58 @@
   [(set_attr "type" "rev")]
 )
 
+(define_insn "*aarch64_bfxil<mode>"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r")
+    (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r,0")
+		    (match_operand:GPI 3 "const_int_operand" "n, Ulc"))
+	    (and:GPI (match_operand:GPI 2 "register_operand" "0,r")
+		    (match_operand:GPI 4 "const_int_operand" "Ulc, n"))))]
+  "(INTVAL (operands[3]) == ~INTVAL (operands[4]))
+  && (aarch64_high_bits_all_ones_p (INTVAL (operands[3]))
+    || aarch64_high_bits_all_ones_p (INTVAL (operands[4])))"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[3])));
+	return "bfxil\\t%<w>0, %<w>1, 0, %3";
+      case 1:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[4])));
+	return "bfxil\\t%<w>0, %<w>2, 0, %3";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfm")]
+)
+
+; Zero-extended version of above (aarch64_bfxil)
+(define_insn "*aarch64_bfxilsi_uxtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(zero_extend:DI (ior:SI (and:SI (match_operand:SI 1 "register_operand"
+					"r,0")
+		    (match_operand:SI 3 "const_int_operand" "n, Ulc"))
+	    (and:SI (match_operand:SI 2 "register_operand" "0,r")
+		    (match_operand:SI 4 "const_int_operand" "Ulc, n")))))]
+  "(INTVAL (operands[3]) == ~INTVAL (operands[4]))
+  && (aarch64_high_bits_all_ones_p (INTVAL (operands[3]))
+    || aarch64_high_bits_all_ones_p (INTVAL (operands[4])))"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[3])));
+	return "bfxil\\t%0, %1, 0, %3";
+      case 1:
+	operands[3] = GEN_INT (ctz_hwi (~INTVAL (operands[4])));
+	return "bfxil\\t%0, %2, 0, %3";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfm")]
+)
+
 ;; There are no canonicalisation rules for the position of the lshiftrt, ashift
 ;; operations within an IOR/AND RTX, therefore we have two patterns matching
 ;; each valid permutation.
@@ -4743,57 +5488,94 @@
   [(set_attr "type" "f_cvtf2i")]
 )
 
-;; fma - no throw
-
-(define_insn "fma<mode>4"
+;; fma - expand fma into patterns with the accumulator operand first since
+;; reusing the accumulator results in better register allocation.
+;; The register allocator considers copy preferences in operand order,
+;; so this prefers fmadd s0, s1, s2, s0 over fmadd s1, s1, s2, s0.
+
+(define_expand "fma<mode>4"
+  [(set (match_operand:GPF_F16 0 "register_operand")
+	(fma:GPF_F16 (match_operand:GPF_F16 1 "register_operand")
+		     (match_operand:GPF_F16 2 "register_operand")
+		     (match_operand:GPF_F16 3 "register_operand")))]
+  "TARGET_FLOAT"
+)
+
+(define_insn "*aarch64_fma<mode>4"
   [(set (match_operand:GPF_F16 0 "register_operand" "=w")
-        (fma:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")
-		     (match_operand:GPF_F16 2 "register_operand" "w")
-		     (match_operand:GPF_F16 3 "register_operand" "w")))]
+	(fma:GPF_F16 (match_operand:GPF_F16 2 "register_operand" "w")
+		     (match_operand:GPF_F16 3 "register_operand" "w")
+		     (match_operand:GPF_F16 1 "register_operand" "w")))]
   "TARGET_FLOAT"
-  "fmadd\\t%<s>0, %<s>1, %<s>2, %<s>3"
+  "fmadd\\t%<s>0, %<s>2, %<s>3, %<s>1"
   [(set_attr "type" "fmac<stype>")]
 )
 
-(define_insn "fnma<mode>4"
+(define_expand "fnma<mode>4"
+  [(set (match_operand:GPF_F16 0 "register_operand")
+	(fma:GPF_F16
+	  (neg:GPF_F16 (match_operand:GPF_F16 1 "register_operand"))
+	  (match_operand:GPF_F16 2 "register_operand")
+	  (match_operand:GPF_F16 3 "register_operand")))]
+  "TARGET_FLOAT"
+)
+
+(define_insn "*aarch64_fnma<mode>4"
   [(set (match_operand:GPF_F16 0 "register_operand" "=w")
 	(fma:GPF_F16
-	  (neg:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w"))
-	  (match_operand:GPF_F16 2 "register_operand" "w")
-	  (match_operand:GPF_F16 3 "register_operand" "w")))]
+	  (neg:GPF_F16 (match_operand:GPF_F16 2 "register_operand" "w"))
+	  (match_operand:GPF_F16 3 "register_operand" "w")
+	  (match_operand:GPF_F16 1 "register_operand" "w")))]
   "TARGET_FLOAT"
-  "fmsub\\t%<s>0, %<s>1, %<s>2, %<s>3"
+  "fmsub\\t%<s>0, %<s>2, %<s>3, %<s>1"
   [(set_attr "type" "fmac<stype>")]
 )
 
-(define_insn "fms<mode>4"
+
+(define_expand "fms<mode>4"
+  [(set (match_operand:GPF 0 "register_operand")
+	(fma:GPF (match_operand:GPF 1 "register_operand")
+		 (match_operand:GPF 2 "register_operand")
+		 (neg:GPF (match_operand:GPF 3 "register_operand"))))]
+  "TARGET_FLOAT"
+)
+
+(define_insn "*aarch64_fms<mode>4"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-        (fma:GPF (match_operand:GPF 1 "register_operand" "w")
-		 (match_operand:GPF 2 "register_operand" "w")
-		 (neg:GPF (match_operand:GPF 3 "register_operand" "w"))))]
+	(fma:GPF (match_operand:GPF 2 "register_operand" "w")
+		 (match_operand:GPF 3 "register_operand" "w")
+		 (neg:GPF (match_operand:GPF 1 "register_operand" "w"))))]
   "TARGET_FLOAT"
-  "fnmsub\\t%<s>0, %<s>1, %<s>2, %<s>3"
+  "fnmsub\\t%<s>0, %<s>2, %<s>3, %<s>1"
   [(set_attr "type" "fmac<s>")]
 )
 
-(define_insn "fnms<mode>4"
+(define_expand "fnms<mode>4"
+  [(set (match_operand:GPF 0 "register_operand")
+	(fma:GPF (neg:GPF (match_operand:GPF 1 "register_operand"))
+		 (match_operand:GPF 2 "register_operand")
+		 (neg:GPF (match_operand:GPF 3 "register_operand"))))]
+  "TARGET_FLOAT"
+)
+
+(define_insn "*aarch64_fnms<mode>4"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-	(fma:GPF (neg:GPF (match_operand:GPF 1 "register_operand" "w"))
-		 (match_operand:GPF 2 "register_operand" "w")
-		 (neg:GPF (match_operand:GPF 3 "register_operand" "w"))))]
+	(fma:GPF (neg:GPF (match_operand:GPF 2 "register_operand" "w"))
+		 (match_operand:GPF 3 "register_operand" "w")
+		 (neg:GPF (match_operand:GPF 1 "register_operand" "w"))))]
   "TARGET_FLOAT"
-  "fnmadd\\t%<s>0, %<s>1, %<s>2, %<s>3"
+  "fnmadd\\t%<s>0, %<s>2, %<s>3, %<s>1"
   [(set_attr "type" "fmac<s>")]
 )
 
 ;; If signed zeros are ignored, -(a * b + c) = -a * b - c.
-(define_insn "*fnmadd<mode>4"
+(define_insn "*aarch64_fnmadd<mode>4"
   [(set (match_operand:GPF 0 "register_operand" "=w")
-	(neg:GPF (fma:GPF (match_operand:GPF 1 "register_operand" "w")
-			  (match_operand:GPF 2 "register_operand" "w")
-			  (match_operand:GPF 3 "register_operand" "w"))))]
+	(neg:GPF (fma:GPF (match_operand:GPF 2 "register_operand" "w")
+			  (match_operand:GPF 3 "register_operand" "w")
+			  (match_operand:GPF 1 "register_operand" "w"))))]
   "!HONOR_SIGNED_ZEROS (<MODE>mode) && TARGET_FLOAT"
-  "fnmadd\\t%<s>0, %<s>1, %<s>2, %<s>3"
+  "fnmadd\\t%<s>0, %<s>2, %<s>3, %<s>1"
   [(set_attr "type" "fmac<s>")]
 )
 
@@ -4849,24 +5631,59 @@
   [(set_attr "type" "f_cvt")]
 )
 
-(define_insn "<optab>_trunc<GPF_F16:mode><GPI:mode>2"
+;; Convert SF -> SI or DF -> DI while preferring w = w register constraints
+;; and making r = w more expensive
+
+(define_insn "<optab>_trunc<fcvt_target><GPI:mode>2"
+  [(set (match_operand:GPI 0 "register_operand" "=w,?r")
+	(FIXUORS:GPI (match_operand:<FCVT_TARGET> 1 "register_operand" "w,w")))]
+  "TARGET_FLOAT"
+  "@
+   fcvtz<su>\t%<s>0, %<s>1
+   fcvtz<su>\t%<w>0, %<s>1"
+  [(set_attr "type" "neon_fp_to_int_s,f_cvtf2i")]
+)
+
+;; Convert HF -> SI or DI
+
+(define_insn "<optab>_trunchf<GPI:mode>2"
   [(set (match_operand:GPI 0 "register_operand" "=r")
-	(FIXUORS:GPI (match_operand:GPF_F16 1 "register_operand" "w")))]
+	(FIXUORS:GPI (match_operand:HF 1 "register_operand" "w")))]
+  "TARGET_FP_F16INST"
+  "fcvtz<su>\t%<w>0, %h1"
+  [(set_attr "type" "f_cvtf2i")]
+)
+
+;; Convert DF -> SI or SF -> DI which can only be accomplished with
+;; input in a fp register and output in a integer register
+
+(define_insn "<optab>_trunc<fcvt_change_mode><GPI:mode>2"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(FIXUORS:GPI (match_operand:<FCVT_CHANGE_MODE> 1 "register_operand" "w")))]
   "TARGET_FLOAT"
-  "fcvtz<su>\t%<GPI:w>0, %<GPF_F16:s>1"
+  "fcvtz<su>\t%<w>0, %<fpw>1"
+  [(set_attr "type" "f_cvtf2i")]
+)
+
+(define_insn "*fix_to_zero_extend<mode>di2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	 (unsigned_fix:SI
+	  (match_operand:GPF 1 "register_operand" "w"))))]
+  "TARGET_FLOAT"
+  "fcvtzu\t%w0, %<s>1"
   [(set_attr "type" "f_cvtf2i")]
 )
 
 (define_insn "<optab><fcvt_target><GPF:mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w,w")
-        (FLOATUORS:GPF (match_operand:<FCVT_TARGET> 1 "register_operand" "w,r")))]
+        (FLOATUORS:GPF (match_operand:<FCVT_TARGET> 1 "register_operand" "w,?r")))]
   "TARGET_FLOAT"
   "@
    <su_optab>cvtf\t%<GPF:s>0, %<s>1
    <su_optab>cvtf\t%<GPF:s>0, %<w1>1"
-  [(set_attr "simd" "yes,no")
-   (set_attr "fp" "no,yes")
-   (set_attr "type" "neon_int_to_fp_<Vetype>,f_cvti2f")]
+  [(set_attr "type" "neon_int_to_fp_<Vetype>,f_cvti2f")
+   (set_attr "arch" "simd,fp")]
 )
 
 (define_insn "<optab><fcvt_iesize><GPF:mode>2"
@@ -4951,8 +5768,7 @@
    <FCVT_F2FIXED:fcvt_fixed_insn>\t%<GPF:w1>0, %<GPF:s>1, #%2
    <FCVT_F2FIXED:fcvt_fixed_insn>\t%<GPF:s>0, %<GPF:s>1, #%2"
   [(set_attr "type" "f_cvtf2i, neon_fp_to_int_<GPF:Vetype>")
-   (set_attr "fp" "yes, *")
-   (set_attr "simd" "*, yes")]
+   (set_attr "arch" "fp,simd")]
 )
 
 (define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3"
@@ -4965,8 +5781,7 @@
    <FCVT_FIXED2F:fcvt_fixed_insn>\t%<GPI:v>0, %<GPI:w>1, #%2
    <FCVT_FIXED2F:fcvt_fixed_insn>\t%<GPI:v>0, %<GPI:v>1, #%2"
   [(set_attr "type" "f_cvti2f, neon_int_to_fp_<GPI:Vetype>")
-   (set_attr "fp" "yes, *")
-   (set_attr "simd" "*, yes")]
+   (set_attr "arch" "fp,simd")]
 )
 
 (define_insn "<FCVT_F2FIXED:fcvt_fixed_insn>hf<mode>3"
@@ -5067,7 +5882,7 @@
  [(set (match_operand:GPF_F16 0 "register_operand")
        (div:GPF_F16 (match_operand:GPF_F16 1 "general_operand")
 		    (match_operand:GPF_F16 2 "register_operand")))]
- "TARGET_SIMD"
+ "TARGET_FLOAT"
 {
   if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
     DONE;
@@ -5255,7 +6070,7 @@
 ;; -------------------------------------------------------------------
 ;; Reload Scalar Floating point modes from constant pool.
 ;; The AArch64 port doesn't have __int128 constant move support.
-(define_expand "aarch64_reload_movcp<GPF_TF:mode><P:mode>"
+(define_expand "@aarch64_reload_movcp<GPF_TF:mode><P:mode>"
  [(set (match_operand:GPF_TF 0 "register_operand" "=w")
        (mem:GPF_TF (match_operand 1 "aarch64_constant_pool_symref" "S")))
   (clobber (match_operand:P 2 "register_operand" "=&r"))]
@@ -5268,7 +6083,7 @@
 )
 
 ;; Reload Vector modes from constant pool.
-(define_expand "aarch64_reload_movcp<VALL:mode><P:mode>"
+(define_expand "@aarch64_reload_movcp<VALL:mode><P:mode>"
  [(set (match_operand:VALL 0 "register_operand" "=w")
        (mem:VALL (match_operand 1 "aarch64_constant_pool_symref" "S")))
   (clobber (match_operand:P 2 "register_operand" "=&r"))]
@@ -5280,7 +6095,7 @@
  }
 )
 
-(define_expand "aarch64_reload_mov<mode>"
+(define_expand "@aarch64_reload_mov<mode>"
   [(set (match_operand:TX 0 "register_operand" "=w")
         (match_operand:TX 1 "register_operand" "w"))
    (clobber (match_operand:DI 2 "register_operand" "=&r"))
@@ -5300,7 +6115,7 @@
 ;; after or during reload as we don't want these patterns to start
 ;; kicking in during the combiner.
 
-(define_insn "aarch64_movdi_<mode>low"
+(define_insn "@aarch64_movdi_<mode>low"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extract:DI (match_operand:TX 1 "register_operand" "w")
 			 (const_int 64) (const_int 0)))]
@@ -5310,7 +6125,7 @@
    (set_attr "length" "4")
   ])
 
-(define_insn "aarch64_movdi_<mode>high"
+(define_insn "@aarch64_movdi_<mode>high"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(zero_extract:DI (match_operand:TX 1 "register_operand" "w")
 			 (const_int 64) (const_int 64)))]
@@ -5320,7 +6135,7 @@
    (set_attr "length" "4")
   ])
 
-(define_insn "aarch64_mov<mode>high_di"
+(define_insn "@aarch64_mov<mode>high_di"
   [(set (zero_extract:TX (match_operand:TX 0 "register_operand" "+w")
                          (const_int 64) (const_int 64))
         (zero_extend:TX (match_operand:DI 1 "register_operand" "r")))]
@@ -5330,7 +6145,7 @@
    (set_attr "length" "4")
   ])
 
-(define_insn "aarch64_mov<mode>low_di"
+(define_insn "@aarch64_mov<mode>low_di"
   [(set (match_operand:TX 0 "register_operand" "=w")
         (zero_extend:TX (match_operand:DI 1 "register_operand" "r")))]
   "TARGET_FLOAT && (reload_completed || reload_in_progress)"
@@ -5375,7 +6190,7 @@
 	(lo_sum:P (match_operand:P 1 "register_operand" "r")
 		  (match_operand 2 "aarch64_valid_symref" "S")))]
   ""
-  "add\\t%<w>0, %<w>1, :lo12:%a2"
+  "add\\t%<w>0, %<w>1, :lo12:%c2"
   [(set_attr "type" "alu_imm")]
 )
 
@@ -5386,7 +6201,7 @@
 			      (match_operand:PTR 2 "aarch64_valid_symref" "S")))]
 		    UNSPEC_GOTSMALLPIC))]
   ""
-  "ldr\\t%<w>0, [%1, #:got_lo12:%a2]"
+  "ldr\\t%<w>0, [%1, #:got_lo12:%c2]"
   [(set_attr "type" "load_<ldst_sz>")]
 )
 
@@ -5398,7 +6213,7 @@
 			     (match_operand:DI 2 "aarch64_valid_symref" "S")))]
 		    UNSPEC_GOTSMALLPIC)))]
   "TARGET_ILP32"
-  "ldr\\t%w0, [%1, #:got_lo12:%a2]"
+  "ldr\\t%w0, [%1, #:got_lo12:%c2]"
   [(set_attr "type" "load_4")]
 )
 
@@ -5409,7 +6224,7 @@
 			      (match_operand:PTR 2 "aarch64_valid_symref" "S")))]
 		    UNSPEC_GOTSMALLPIC28K))]
   ""
-  "ldr\\t%<w>0, [%1, #:<got_modifier>:%a2]"
+  "ldr\\t%<w>0, [%1, #:<got_modifier>:%c2]"
   [(set_attr "type" "load_<ldst_sz>")]
 )
 
@@ -5421,7 +6236,7 @@
 			     (match_operand:DI 2 "aarch64_valid_symref" "S")))]
 		    UNSPEC_GOTSMALLPIC28K)))]
   "TARGET_ILP32"
-  "ldr\\t%w0, [%1, #:gotpage_lo14:%a2]"
+  "ldr\\t%w0, [%1, #:gotpage_lo14:%c2]"
   [(set_attr "type" "load_4")]
 )
 
@@ -5553,14 +6368,91 @@
    (set_attr "length" "12")]
 )
 
-(define_insn "tlsdesc_small_<mode>"
+(define_expand "tlsdesc_small_<mode>"
+  [(unspec:PTR [(match_operand 0 "aarch64_valid_symref")] UNSPEC_TLSDESC)]
+  "TARGET_TLS_DESC"
+  {
+    if (TARGET_SVE)
+      emit_insn (gen_tlsdesc_small_sve_<mode> (operands[0]));
+    else
+      emit_insn (gen_tlsdesc_small_advsimd_<mode> (operands[0]));
+    DONE;
+  }
+)
+
+;; tlsdesc calls preserve all core and Advanced SIMD registers except
+;; R0 and LR.
+(define_insn "tlsdesc_small_advsimd_<mode>"
   [(set (reg:PTR R0_REGNUM)
         (unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")]
-		   UNSPEC_TLSDESC))
+		    UNSPEC_TLSDESC))
    (clobber (reg:DI LR_REGNUM))
    (clobber (reg:CC CC_REGNUM))
    (clobber (match_scratch:DI 1 "=r"))]
-  "TARGET_TLS_DESC"
+  "TARGET_TLS_DESC && !TARGET_SVE"
+  "adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
+  [(set_attr "type" "call")
+   (set_attr "length" "16")])
+
+;; For SVE, model tlsdesc calls as clobbering the lower 128 bits of
+;; all vector registers, and clobber all predicate registers, on
+;; top of the usual R0 and LR.
+(define_insn "tlsdesc_small_sve_<mode>"
+  [(set (reg:PTR R0_REGNUM)
+        (unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")]
+		    UNSPEC_TLSDESC))
+   (clobber (reg:DI LR_REGNUM))
+   (clobber (reg:CC CC_REGNUM))
+   (clobber_high (reg:TI V0_REGNUM))
+   (clobber_high (reg:TI V1_REGNUM))
+   (clobber_high (reg:TI V2_REGNUM))
+   (clobber_high (reg:TI V3_REGNUM))
+   (clobber_high (reg:TI V4_REGNUM))
+   (clobber_high (reg:TI V5_REGNUM))
+   (clobber_high (reg:TI V6_REGNUM))
+   (clobber_high (reg:TI V7_REGNUM))
+   (clobber_high (reg:TI V8_REGNUM))
+   (clobber_high (reg:TI V9_REGNUM))
+   (clobber_high (reg:TI V10_REGNUM))
+   (clobber_high (reg:TI V11_REGNUM))
+   (clobber_high (reg:TI V12_REGNUM))
+   (clobber_high (reg:TI V13_REGNUM))
+   (clobber_high (reg:TI V14_REGNUM))
+   (clobber_high (reg:TI V15_REGNUM))
+   (clobber_high (reg:TI V16_REGNUM))
+   (clobber_high (reg:TI V17_REGNUM))
+   (clobber_high (reg:TI V18_REGNUM))
+   (clobber_high (reg:TI V19_REGNUM))
+   (clobber_high (reg:TI V20_REGNUM))
+   (clobber_high (reg:TI V21_REGNUM))
+   (clobber_high (reg:TI V22_REGNUM))
+   (clobber_high (reg:TI V23_REGNUM))
+   (clobber_high (reg:TI V24_REGNUM))
+   (clobber_high (reg:TI V25_REGNUM))
+   (clobber_high (reg:TI V26_REGNUM))
+   (clobber_high (reg:TI V27_REGNUM))
+   (clobber_high (reg:TI V28_REGNUM))
+   (clobber_high (reg:TI V29_REGNUM))
+   (clobber_high (reg:TI V30_REGNUM))
+   (clobber_high (reg:TI V31_REGNUM))
+   (clobber (reg:VNx2BI P0_REGNUM))
+   (clobber (reg:VNx2BI P1_REGNUM))
+   (clobber (reg:VNx2BI P2_REGNUM))
+   (clobber (reg:VNx2BI P3_REGNUM))
+   (clobber (reg:VNx2BI P4_REGNUM))
+   (clobber (reg:VNx2BI P5_REGNUM))
+   (clobber (reg:VNx2BI P6_REGNUM))
+   (clobber (reg:VNx2BI P7_REGNUM))
+   (clobber (reg:VNx2BI P8_REGNUM))
+   (clobber (reg:VNx2BI P9_REGNUM))
+   (clobber (reg:VNx2BI P10_REGNUM))
+   (clobber (reg:VNx2BI P11_REGNUM))
+   (clobber (reg:VNx2BI P12_REGNUM))
+   (clobber (reg:VNx2BI P13_REGNUM))
+   (clobber (reg:VNx2BI P14_REGNUM))
+   (clobber (reg:VNx2BI P15_REGNUM))
+   (clobber (match_scratch:DI 1 "=r"))]
+  "TARGET_TLS_DESC && TARGET_SVE"
   "adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
   [(set_attr "type" "call")
    (set_attr "length" "16")])
@@ -5620,7 +6512,7 @@
 )
 
 (define_insn "probe_stack_range"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=rk")
 	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
 			     (match_operand:DI 2 "register_operand" "r")]
 			      UNSPECV_PROBE_STACK_RANGE))]
@@ -5631,6 +6523,25 @@
   [(set_attr "length" "32")]
 )
 
+;; This instruction is used to generate the stack clash stack adjustment and
+;; probing loop.  We can't change the control flow during prologue and epilogue
+;; code generation.  So we must emit a volatile unspec and expand it later on.
+
+(define_insn "@probe_sve_stack_clash_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=rk")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+			    (match_operand:P 2 "register_operand" "r")
+			    (match_operand:P 3 "const_int_operand" "n")
+			    (match_operand:P 4 "aarch64_plus_immediate" "L")]
+			     UNSPECV_PROBE_STACK_RANGE))]
+  "TARGET_SVE"
+{
+  return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
+					       operands[3], operands[4]);
+}
+  [(set_attr "length" "28")]
+)
+
 ;; Named pattern for expanding thread pointer reference.
 (define_expand "get_thread_pointerdi"
   [(match_operand:DI 0 "register_operand" "=r")]
@@ -5771,6 +6682,134 @@
   DONE;
 })
 
+;; Track speculation through conditional branches.  We assume that
+;; SPECULATION_TRACKER_REGNUM is reserved for this purpose when necessary.
+(define_insn "speculation_tracker"
+  [(set (reg:DI SPECULATION_TRACKER_REGNUM)
+	(unspec [(reg:DI SPECULATION_TRACKER_REGNUM) (match_operand 0)]
+	 UNSPEC_SPECULATION_TRACKER))]
+  ""
+  {
+    operands[1] = gen_rtx_REG (DImode, SPECULATION_TRACKER_REGNUM);
+    output_asm_insn ("csel\\t%1, %1, xzr, %m0", operands);
+    return "";
+  }
+  [(set_attr "type" "csel")]
+)
+
+;; Helper for aarch64.c code.
+(define_expand "set_clobber_cc"
+  [(parallel [(set (match_operand 0)
+		   (match_operand 1))
+	      (clobber (reg:CC CC_REGNUM))])])
+
+;; Hard speculation barrier.
+(define_insn "speculation_barrier"
+  [(unspec_volatile [(const_int 0)] UNSPECV_SPECULATION_BARRIER)]
+  ""
+  "isb\;dsb\\tsy"
+  [(set_attr "length" "8")
+   (set_attr "type" "block")
+   (set_attr "speculation_barrier" "true")]
+)
+
+;; Support for __builtin_speculation_safe_value when we have speculation
+;; tracking enabled.  Use the speculation tracker to decide whether to
+;; copy operand 1 to the target, or to copy the fail value (operand 2).
+(define_expand "@despeculate_copy<ALLI_TI:mode>"
+  [(set (match_operand:ALLI_TI 0 "register_operand" "=r")
+	(unspec_volatile:ALLI_TI
+	 [(match_operand:ALLI_TI 1 "register_operand" "r")
+	  (match_operand:ALLI_TI 2 "aarch64_reg_or_zero" "rZ")
+	  (use (reg:DI SPECULATION_TRACKER_REGNUM))
+	  (clobber (reg:CC CC_REGNUM))] UNSPECV_SPECULATION_BARRIER))]
+  ""
+  "
+  {
+    if (operands[2] == const0_rtx)
+      {
+	rtx tracker;
+	if (<MODE>mode == TImode)
+	  tracker = gen_rtx_REG (DImode, SPECULATION_TRACKER_REGNUM);
+	else
+	  tracker = gen_rtx_REG (<MODE>mode, SPECULATION_TRACKER_REGNUM);
+
+	emit_insn (gen_despeculate_simple<mode> (operands[0], operands[1],
+						 tracker));
+	DONE;
+      }
+  }
+  "
+)
+
+;; Patterns to match despeculate_copy<mode>.  Note that "hint 0x14" is the
+;; encoding for CSDB, but will work in older versions of the assembler.
+(define_insn "*despeculate_copy<ALLI:mode>_insn"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
+	(unspec_volatile:ALLI
+	 [(match_operand:ALLI 1 "register_operand" "r")
+	  (match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")
+	  (use (reg:DI SPECULATION_TRACKER_REGNUM))
+	  (clobber (reg:CC CC_REGNUM))] UNSPECV_SPECULATION_BARRIER))]
+  ""
+  {
+    operands[3] = gen_rtx_REG (DImode, SPECULATION_TRACKER_REGNUM);
+    output_asm_insn ("cmp\\t%3, #0\;csel\\t%<w>0, %<w>1, %<w>2, ne\;hint\t0x14 // csdb",
+		     operands);
+    return "";
+  }
+  [(set_attr "length" "12")
+   (set_attr "type" "block")
+   (set_attr "speculation_barrier" "true")]
+)
+
+;; Pattern to match despeculate_copyti
+(define_insn "*despeculate_copyti_insn"
+  [(set (match_operand:TI 0 "register_operand" "=r")
+	(unspec_volatile:TI
+	 [(match_operand:TI 1 "register_operand" "r")
+	  (match_operand:TI 2 "aarch64_reg_or_zero" "rZ")
+	  (use (reg:DI SPECULATION_TRACKER_REGNUM))
+	  (clobber (reg:CC CC_REGNUM))] UNSPECV_SPECULATION_BARRIER))]
+  ""
+  {
+    operands[3] = gen_rtx_REG (DImode, SPECULATION_TRACKER_REGNUM);
+    output_asm_insn
+      ("cmp\\t%3, #0\;csel\\t%0, %1, %2, ne\;csel\\t%H0, %H1, %H2, ne\;hint\t0x14 // csdb",
+       operands);
+    return "";
+  }
+  [(set_attr "length" "16")
+   (set_attr "type" "block")
+   (set_attr "speculation_barrier" "true")]
+)
+
+(define_insn "despeculate_simple<ALLI:mode>"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
+	(unspec_volatile:ALLI
+	 [(match_operand:ALLI 1 "register_operand" "r")
+	  (use (match_operand:ALLI 2 "register_operand" ""))]
+	 UNSPECV_SPECULATION_BARRIER))]
+  ""
+  "and\\t%<w>0, %<w>1, %<w>2\;hint\t0x14 // csdb"
+  [(set_attr "type" "block")
+   (set_attr "length" "8")
+   (set_attr "speculation_barrier" "true")]
+)
+
+(define_insn "despeculate_simpleti"
+  [(set (match_operand:TI 0 "register_operand" "=r")
+	(unspec_volatile:TI
+	 [(match_operand:TI 1 "register_operand" "r")
+	  (use (match_operand:DI 2 "register_operand" ""))]
+	 UNSPECV_SPECULATION_BARRIER))]
+  ""
+  "and\\t%0, %1, %2\;and\\t%H0, %H1, %2\;hint\t0x14 // csdb"
+  [(set_attr "type" "block")
+   (set_attr "length" "12")
+   (set_attr "speculation_barrier" "true")]
+)
+
 ;; AdvSIMD Stuff
 (include "aarch64-simd.md")
 
@@ -5779,3 +6818,6 @@
 
 ;; ldp/stp peephole patterns
 (include "aarch64-ldpstp.md")
+
+;; SVE.
+(include "aarch64-sve.md")